wikontic-0.0.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. wikontic/__init__.py +16 -0
  2. wikontic/create_ontological_triplets_db.py +193 -0
  3. wikontic/create_triplets_db.py +259 -0
  4. wikontic/create_wikidata_ontology_db.py +555 -0
  5. wikontic/utils/__init__.py +7 -0
  6. wikontic/utils/base_inference_with_db.py +329 -0
  7. wikontic/utils/dynamic_aligner.py +281 -0
  8. wikontic/utils/inference_with_db.py +224 -0
  9. wikontic/utils/ontology_mappings/entity_hierarchy.json +1 -0
  10. wikontic/utils/ontology_mappings/entity_names.json +1 -0
  11. wikontic/utils/ontology_mappings/entity_type2aliases.json +1 -0
  12. wikontic/utils/ontology_mappings/entity_type2hierarchy.json +1 -0
  13. wikontic/utils/ontology_mappings/entity_type2label.json +1 -0
  14. wikontic/utils/ontology_mappings/enum_entity_ids.json +1 -0
  15. wikontic/utils/ontology_mappings/enum_prop_ids.json +1 -0
  16. wikontic/utils/ontology_mappings/label2entity.json +1 -0
  17. wikontic/utils/ontology_mappings/obj_constraint2prop.json +1 -0
  18. wikontic/utils/ontology_mappings/prop2aliases.json +1 -0
  19. wikontic/utils/ontology_mappings/prop2constraints.json +1 -0
  20. wikontic/utils/ontology_mappings/prop2data_type.json +1 -0
  21. wikontic/utils/ontology_mappings/prop2label.json +1 -0
  22. wikontic/utils/ontology_mappings/propid2enum.json +1 -0
  23. wikontic/utils/ontology_mappings/subj_constraint2prop.json +1 -0
  24. wikontic/utils/ontology_mappings/subject_object_constraints.json +1 -0
  25. wikontic/utils/openai_utils.py +517 -0
  26. wikontic/utils/prompts/name_refinement/prompt_choose_relation_wo_entity_types.txt +17 -0
  27. wikontic/utils/prompts/name_refinement/prompt_choose_relation_wo_entity_types_dialog_bench.txt +18 -0
  28. wikontic/utils/prompts/name_refinement/rank_object_names.txt +17 -0
  29. wikontic/utils/prompts/name_refinement/rank_object_names_dialog_bench.txt +18 -0
  30. wikontic/utils/prompts/name_refinement/rank_object_qualifiers.txt +20 -0
  31. wikontic/utils/prompts/name_refinement/rank_subject_names.txt +18 -0
  32. wikontic/utils/prompts/name_refinement/rank_subject_names_dialog_bench.txt +20 -0
  33. wikontic/utils/prompts/ontology_refinement/prompt_choose_entity_types.txt +26 -0
  34. wikontic/utils/prompts/ontology_refinement/prompt_choose_relation.txt +24 -0
  35. wikontic/utils/prompts/ontology_refinement/prompt_choose_relation_and_types.txt +28 -0
  36. wikontic/utils/prompts/qa/prompt_choose_relevant_entities_for_question.txt +17 -0
  37. wikontic/utils/prompts/qa/prompt_choose_relevant_entities_for_question_wo_types.txt +16 -0
  38. wikontic/utils/prompts/qa/prompt_entity_extraction_from_question.txt +3 -0
  39. wikontic/utils/prompts/qa/prompt_is_answered.txt +43 -0
  40. wikontic/utils/prompts/qa/qa_collapsing_prompt.txt +22 -0
  41. wikontic/utils/prompts/qa/qa_prompt.txt +5 -0
  42. wikontic/utils/prompts/qa/qa_prompt_hotpot.txt +6 -0
  43. wikontic/utils/prompts/qa/question_decomposition_1.txt +7 -0
  44. wikontic/utils/prompts/triplet_extraction/prompt_1_types_qualifiers_dialog_bench.txt +75 -0
  45. wikontic/utils/prompts/triplet_extraction/prompt_1_types_qualifiers_dialog_bench_in_russian.txt +78 -0
  46. wikontic/utils/prompts/triplet_extraction/propmt_1_types_qualifiers.txt +91 -0
  47. wikontic/utils/structured_aligner.py +606 -0
  48. wikontic/utils/structured_inference_with_db.py +561 -0
  49. wikontic-0.0.3.dist-info/METADATA +111 -0
  50. wikontic-0.0.3.dist-info/RECORD +53 -0
  51. wikontic-0.0.3.dist-info/WHEEL +5 -0
  52. wikontic-0.0.3.dist-info/licenses/LICENSE +19 -0
  53. wikontic-0.0.3.dist-info/top_level.txt +1 -0
wikontic/utils/structured_inference_with_db.py
@@ -0,0 +1,561 @@
+ from unidecode import unidecode
+ import re
+ import warnings
+ from typing import Dict, List, Tuple
+ from langchain.tools import tool
+ import logging
+
+ from .base_inference_with_db import BaseInferenceWithDB
+
+ warnings.filterwarnings("ignore")
+ logger = logging.getLogger("StructuredInferenceWithDB")
+ logger.setLevel(logging.ERROR)
+
+
+ class StructuredInferenceWithDB(BaseInferenceWithDB):
+     def __init__(self, extractor, aligner, triplets_db):
+         self.extractor = extractor
+         self.aligner = aligner
+         self.triplets_db = triplets_db
+
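+         # expose the main entry points as LangChain tools so they can be
+         # plugged into an agent; tool() wraps each bound method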
+         self.extract_triplets_with_ontology_filtering_tool = tool(
+             self.extract_triplets_with_ontology_filtering
+         )
+         self.extract_triplets_with_ontology_filtering_and_add_to_db_tool = tool(
+             self.extract_triplets_with_ontology_filtering_and_add_to_db
+         )
+         self.retrieve_similar_entity_names_tool = tool(
+             self.retrieve_similar_entity_names
+         )
+         self.identify_relevant_entities_from_question_tool = tool(
+             self.identify_relevant_entities_from_question_with_llm
+         )
+         self.get_1_hop_supporting_triplets_tool = tool(
+             self.get_1_hop_supporting_triplets
+         )
+
+     # 1st step extraction without database
+     def _refine_entity_types(self, text, triplet):
+         """
+         Refine entity types using an LLM.
+         """
+         candidate_subj_type_ids, candidate_obj_type_ids = (
+             self.aligner.retrieve_similar_entity_types(triplet=triplet)
+         )
+
+         candidate_entity_type_id_2_label = self.aligner.retrieve_entity_type_labels(
+             candidate_subj_type_ids + candidate_obj_type_ids
+         )
+
+         candidate_entity_type_label_2_id = {
+             entity_label: entity_id
+             for entity_id, entity_label in candidate_entity_type_id_2_label.items()
+         }
+
+         candidate_subject_types = [
+             candidate_entity_type_id_2_label[t] for t in candidate_subj_type_ids
+         ]
+         candidate_object_types = [
+             candidate_entity_type_id_2_label[t] for t in candidate_obj_type_ids
+         ]
+
+         # no need to refine if the triplet's types are in the candidate types
+         if (
+             triplet["subject_type"] in candidate_subject_types
+             and triplet["object_type"] in candidate_object_types
+         ):
+             refined_subject_type, refined_object_type = (
+                 triplet["subject_type"],
+                 triplet["object_type"],
+             )
+             refined_subject_type_id = candidate_entity_type_label_2_id[
+                 triplet["subject_type"]
+             ]
+             refined_object_type_id = candidate_entity_type_label_2_id[
+                 triplet["object_type"]
+             ]
+
+         else:
+             # if the triplet's subject type is in the candidate types,
+             # then only refine the subject type
+             if triplet["subject_type"] in candidate_subject_types:
+                 candidate_subject_types = [triplet["subject_type"]]
+             # if the triplet's object type is in the candidate types,
+             # then only refine the object type
+             if triplet["object_type"] in candidate_object_types:
+                 candidate_object_types = [triplet["object_type"]]
+
+             self.extractor.reset_error_state()
+             refined_entity_types = self.extractor.refine_entity_types(
+                 text=text,
+                 triplet=triplet,
+                 candidate_subject_types=candidate_subject_types,
+                 candidate_object_types=candidate_object_types,
+             )
+             refined_subject_type, refined_object_type = (
+                 refined_entity_types["subject_type"],
+                 refined_entity_types["object_type"],
+             )
+
+             refined_subject_type_id = (
+                 candidate_entity_type_label_2_id[refined_subject_type]
+                 if refined_subject_type in candidate_subject_types
+                 else None
+             )
+
+             refined_object_type_id = (
+                 candidate_entity_type_label_2_id[refined_object_type]
+                 if refined_object_type in candidate_object_types
+                 else None
+             )
+
+         return (
+             refined_subject_type,
+             refined_subject_type_id,
+             refined_object_type,
+             refined_object_type_id,
+         )
+
+     def _get_candidate_entity_properties(
+         self, triplet: Dict[str, str], subj_type_ids: List[str], obj_type_ids: List[str]
+     ) -> Tuple[List[Tuple[str, str]], Dict[str, dict]]:
+         """
+         Retrieve candidate properties and their labels/constraints.
+         """
+         # Get the list of tuples (<property_id>, <property_direction>)
+         properties: List[Tuple[str, str]] = (
+             self.aligner.retrieve_properties_for_entity_type(
+                 target_relation=triplet["relation"],
+                 object_types=obj_type_ids,
+                 subject_types=subj_type_ids,
+                 k=10,
+             )
+         )
+         # Get dict {<prop_id>:
+         #     {"label": <prop_label>,
+         #      "valid_subject_type_ids": <valid_subject_type_ids>,
+         #      "valid_object_type_ids": <valid_object_type_ids>}}
+         prop_2_label_and_constraint = (
+             self.aligner.retrieve_properties_labels_and_constraints(
+                 property_id_list=[p[0] for p in properties]
+             )
+         )
+         return properties, prop_2_label_and_constraint
+
+     def _refine_relation(
+         self, text, triplet, refined_subject_type_id, refined_object_type_id
+     ):
+         """
+         Refine the relation using an LLM.
+         """
+         # if refined subject and object types are in the candidate types,
+         # then refine the relation
+         if refined_subject_type_id and refined_object_type_id:
+             relation_direction_candidate_pairs, prop_2_label_and_constraint = (
+                 self._get_candidate_entity_properties(
+                     triplet=triplet,
+                     subj_type_ids=[refined_subject_type_id],
+                     obj_type_ids=[refined_object_type_id],
+                 )
+             )
+             candidate_relations = [
+                 prop_2_label_and_constraint[p[0]]["label"]
+                 for p in relation_direction_candidate_pairs
+             ]
+             # no need to refine
+             # if the triplet's relation is in the candidate relations
+             if triplet["relation"] in candidate_relations:
+                 refined_relation = triplet["relation"]
+             else:
+                 self.extractor.reset_error_state()
+                 refined_relation = self.extractor.refine_relation(
+                     text=text, triplet=triplet, candidate_relations=candidate_relations
+                 )["relation"]
+         # if refined subject and object types are not in the candidate types,
+         # leave the relation as it is
+         else:
+             refined_relation = triplet["relation"]
+             candidate_relations = []
+
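+         # NB: relation_direction_candidate_pairs and prop_2_label_and_constraint
+         # are bound only in the first branch above; when candidate_relations is
+         # empty, the lookup branch below is skipped, so no unbound-name error occurs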
+         # if the refined relation is in the candidate relations,
+         # then identify the relation direction
+         if refined_relation in candidate_relations:
+             refined_relation_id_candidates = [
+                 p_id
+                 for p_id in prop_2_label_and_constraint
+                 if prop_2_label_and_constraint[p_id]["label"] == refined_relation
+             ]
+             refined_relation_id = refined_relation_id_candidates[0]
+             refined_relation_directions = [
+                 p[1]
+                 for p in relation_direction_candidate_pairs
+                 if p[0] == refined_relation_id
+             ]
+             refined_relation_direction = (
+                 "direct" if "direct" in refined_relation_directions else "inverse"
+             )
+
+             prop_subject_type_ids = [
+                 prop_2_label_and_constraint[prop]["valid_subject_type_ids"]
+                 for prop in prop_2_label_and_constraint
+                 if prop_2_label_and_constraint[prop]["label"] == refined_relation
+             ][0]
+             prop_object_type_ids = [
+                 prop_2_label_and_constraint[prop]["valid_object_type_ids"]
+                 for prop in prop_2_label_and_constraint
+                 if prop_2_label_and_constraint[prop]["label"] == refined_relation
+             ][0]
+
+         else:
+             refined_relation_direction = "direct"
+             refined_relation_id = None
+             prop_subject_type_ids = []
+             prop_object_type_ids = []
+
+         return (
+             refined_relation,
+             refined_relation_id,
+             refined_relation_direction,
+             prop_subject_type_ids,
+             prop_object_type_ids,
+         )
+
+     def _validate_backbone(
+         self,
+         refined_subject_type: str,
+         refined_object_type: str,
+         refined_relation: str,
+         refined_object_type_id: str,
+         refined_subject_type_id: str,
+         refined_relation_id: str,
+         valid_subject_type_ids: List[str],
+         valid_object_type_ids: List[str],
+     ):
+         """
+         Check if the selected backbone triplet's types and relation are in the valid sets.
+         """
+
+         exception_msg = ""
+         if not refined_relation_id:
+             exception_msg += "Refined relation not in candidate relations\n"
+         if not refined_subject_type_id:
+             exception_msg += "Refined subject type not in candidate subject types\n"
+         if not refined_object_type_id:
+             exception_msg += "Refined object type not in candidate object types\n"
+
+         if exception_msg != "":
+             return False, exception_msg
+
+         else:
+
+             subject_type_hierarchy = self.aligner.retrieve_entity_type_hierarchy(
+                 refined_subject_type
+             )
+             object_type_hierarchy = self.aligner.retrieve_entity_type_hierarchy(
+                 refined_object_type
+             )
+
+             if valid_subject_type_ids == ["ANY"]:
+                 valid_subject_type_ids = subject_type_hierarchy
+             if valid_object_type_ids == ["ANY"]:
+                 valid_object_type_ids = object_type_hierarchy
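+             # "ANY" places no constraint on that slot; substituting the entity's
+             # own hierarchy makes the membership check below succeed for any
+             # non-empty hierarchy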
+
+             if any(
+                 [t in subject_type_hierarchy for t in valid_subject_type_ids]
+             ) and any([t in object_type_hierarchy for t in valid_object_type_ids]):
+                 return True, exception_msg
+             else:
+                 exception_msg += "Triplet backbone violates property constraints\n"
+                 return False, exception_msg
+
+     def _refine_entity_name(self, text, triplet, sample_id, is_object=False):
+         """
+         Refine entity names using type constraints.
+         """
+         self.extractor.reset_error_state()
+         if is_object:
+             entity = unidecode(triplet["object"])
+             entity_type = triplet["object_type"]
+             entity_hierarchy = self.aligner.retrieve_entity_type_hierarchy(entity_type)
+         else:
+             entity = unidecode(triplet["subject"])
+             entity_type = triplet["subject_type"]
+             entity_hierarchy = []
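+             # subjects get an empty hierarchy, so the time/quantity guard below
+             # only ever fires for objects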
+
+         # do not change time or quantity entities (of objects!)
+         if any([t in ["Q186408", "Q309314"] for t in entity_hierarchy]):
+             updated_entity = entity
+         else:
+             # if not a time or quantity entity -> retrieve similar entities by type and name similarity
+             similar_entities = self.aligner.retrieve_entity_by_type(
+                 entity_name=entity, entity_type=entity_type, sample_id=sample_id
+             )
+             # if there are similar entities -> refine the entity name
+             if len(similar_entities) > 0:
+                 # if an exact match is found -> return the exact match
+                 if entity in similar_entities:
+                     updated_entity = similar_entities[entity]
+                 else:
+                     # if no exact match -> refine the entity name
+                     updated_entity = self.extractor.refine_entity(
+                         text=text,
+                         triplet=triplet,
+                         candidates=list(similar_entities.values()),
+                         is_object=is_object,
+                     )
+                     # unidecode the updated entity
+                     updated_entity = unidecode(updated_entity)
+                     # if the updated entity is None (meaning the LLM didn't find any similar entities)
+                     # -> return the original entity
+                     if re.sub(r"[^\w\s]", "", updated_entity) == "None":
+                         updated_entity = entity
+             else:
+                 # if no similar entities -> return the original entity
+                 updated_entity = entity
+
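+         # register the refined name with the original surface form as an alias,
+         # so later mentions can be deduplicated against it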
+         self.aligner.add_entity(
+             entity_name=updated_entity,
+             alias=entity,
+             entity_type=entity_type,
+             sample_id=sample_id,
+         )
+
+         return updated_entity
+
+     def extract_triplets_with_ontology_filtering(
+         self, text, sample_id=None, source_text_id=None
+     ):
+         """
+         Extract and refine knowledge graph triplets from text using an LLM.
+
+         Args:
+             text (str): Input text to extract triplets from
+             sample_id (str): Sample ID - used to distinguish graphs produced
+                 in different runs/by different users
+             source_text_id (str): Optional; used to distinguish texts from
+                 different sources (for example, different paragraphs of the same text)
+         Returns:
+             tuple:
+                 (initial_triplets, final_triplets, filtered_triplets, ontology_filtered_triplets)
+         """
+         self.extractor.reset_tokens()
+         self.extractor.reset_messages()
+         self.extractor.reset_error_state()
+
+         extracted_triplets = self.extractor.extract_triplets_from_text(text)
+
+         initial_triplets = []
+         for triplet in extracted_triplets["triplets"]:
+             triplet["prompt_token_num"], triplet["completion_token_num"] = (
+                 self.extractor.calculate_used_tokens()
+             )
+             triplet["source_text_id"] = source_text_id
+             triplet["sample_id"] = sample_id
+             initial_triplets.append(triplet.copy())
+
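+         # three output buckets: final (accepted), filtered (refinement raised an
+         # exception), ontology_filtered (backbone failed constraint validation)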
+         final_triplets = []
+         filtered_triplets = []
+         ontology_filtered_triplets = []
+
+         for triplet in extracted_triplets["triplets"]:
+             self.extractor.reset_tokens()
+             try:
+                 logger.log(logging.DEBUG, "Triplet: %s\n%s" % (str(triplet), "-" * 100))
+
+                 # _____________ Refine entity types __________
+
+                 (
+                     refined_subject_type,
+                     refined_subject_type_id,
+                     refined_object_type,
+                     refined_object_type_id,
+                 ) = self._refine_entity_types(text=text, triplet=triplet)
+
+                 # ________________ Refine relation ________________
+                 (
+                     refined_relation,
+                     refined_relation_id,
+                     refined_relation_direction,
+                     prop_subject_type_ids,
+                     prop_object_type_ids,
+                 ) = self._refine_relation(
+                     text=text,
+                     triplet=triplet,
+                     refined_subject_type_id=refined_subject_type_id,
+                     refined_object_type_id=refined_object_type_id,
+                 )
+
+                 if refined_relation_direction == "inverse":
+                     refined_subject_type_id, refined_object_type_id = (
+                         refined_object_type_id,
+                         refined_subject_type_id,
+                     )
+
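+                 # direction handling: when the matched property runs opposite to
+                 # the extracted triplet, names and types are swapped below so the
+                 # stored triplet always follows the property's direct direction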
+                 # __________ Refine entity names ___________
+                 backbone_triplet = {
+                     "subject": (
+                         triplet["subject"]
+                         if refined_relation_direction == "direct"
+                         else triplet["object"]
+                     ),
+                     "relation": refined_relation,
+                     "object": (
+                         triplet["object"]
+                         if refined_relation_direction == "direct"
+                         else triplet["subject"]
+                     ),
+                     "subject_type": (
+                         refined_subject_type
+                         if refined_relation_direction == "direct"
+                         else refined_object_type
+                     ),
+                     "object_type": (
+                         refined_object_type
+                         if refined_relation_direction == "direct"
+                         else refined_subject_type
+                     ),
+                 }
+
+                 backbone_triplet["qualifiers"] = triplet["qualifiers"]
+                 # keep the pre-refinement names so the debug log below reports
+                 # the true originals, not the already-refined values
+                 original_subject_name = backbone_triplet["subject"]
+                 original_object_name = backbone_triplet["object"]
+                 if refined_subject_type_id:
+                     backbone_triplet["subject"] = self._refine_entity_name(
+                         text, backbone_triplet, sample_id, is_object=False
+                     )
+
+                 if refined_object_type_id:
+                     backbone_triplet["object"] = self._refine_entity_name(
+                         text, backbone_triplet, sample_id, is_object=True
+                     )
+
+                 logger.log(
+                     logging.DEBUG,
+                     "Original subject name: %s\n%s"
+                     % (str(original_subject_name), "-" * 100),
+                 )
+                 logger.log(
+                     logging.DEBUG,
+                     "Original object name: %s\n%s"
+                     % (str(original_object_name), "-" * 100),
+                 )
+                 logger.log(
+                     logging.DEBUG,
+                     "Refined subject name: %s\n%s"
+                     % (str(backbone_triplet["subject"]), "-" * 100),
+                 )
+                 logger.log(
+                     logging.DEBUG,
+                     "Refined object name: %s\n%s"
+                     % (str(backbone_triplet["object"]), "-" * 100),
+                 )
+
+                 (
+                     backbone_triplet["prompt_token_num"],
+                     backbone_triplet["completion_token_num"],
+                 ) = self.extractor.calculate_used_tokens()
+                 backbone_triplet["source_text_id"] = source_text_id
+                 backbone_triplet["sample_id"] = sample_id
+
+                 # ___________________________ Validate backbone triplet ___________________________
+                 backbone_triplet_valid, backbone_triplet_exception_msg = (
+                     self._validate_backbone(
+                         backbone_triplet["subject_type"],
+                         backbone_triplet["object_type"],
+                         backbone_triplet["relation"],
+                         refined_object_type_id,
+                         refined_subject_type_id,
+                         refined_relation_id,
+                         prop_subject_type_ids,
+                         prop_object_type_ids,
+                     )
+                 )
+
+                 if backbone_triplet_valid:
+                     final_triplets.append(backbone_triplet.copy())
+                     logger.log(
+                         logging.DEBUG,
+                         "Final triplet: %s\n%s" % (str(backbone_triplet), "-" * 100),
+                     )
+                 else:
+                     logger.log(
+                         logging.ERROR,
+                         "Final triplet is ontology filtered: %s\n%s"
+                         % (str(backbone_triplet), "-" * 100),
+                     )
+                     logger.log(
+                         logging.ERROR,
+                         "Exception: %s" % (str(backbone_triplet_exception_msg)),
+                     )
+                     logger.log(
+                         logging.ERROR, "Refined relation: %s" % (str(refined_relation))
+                     )
+                     logger.log(
+                         logging.ERROR,
+                         "Refined subject type: %s" % (str(refined_subject_type)),
+                     )
+                     logger.log(
+                         logging.ERROR,
+                         "Refined object type: %s" % (str(refined_object_type)),
+                     )
+
+                     backbone_triplet["exception_text"] = backbone_triplet_exception_msg
+                     ontology_filtered_triplets.append(backbone_triplet.copy())
+
+             except Exception as e:
+                 backbone_triplet = triplet.copy()
+                 (
+                     backbone_triplet["prompt_token_num"],
+                     backbone_triplet["completion_token_num"],
+                 ) = self.extractor.calculate_used_tokens()
+                 backbone_triplet["source_text_id"] = source_text_id
+                 backbone_triplet["sample_id"] = sample_id
+                 backbone_triplet["exception_text"] = str(e)
+                 filtered_triplets.append(backbone_triplet.copy())
+                 logger.log(
+                     logging.INFO,
+                     "Filtered triplet: %s\n%s" % (str(backbone_triplet), "-" * 100),
+                 )
+                 logger.log(logging.INFO, "Exception: %s" % (str(e)))
+
+         return (
+             initial_triplets,
+             final_triplets,
+             filtered_triplets,
+             ontology_filtered_triplets,
+         )
+
+     def extract_triplets_with_ontology_filtering_and_add_to_db(
+         self, text, sample_id=None, source_text_id=None
+     ):
+         """
+         Extract and refine knowledge graph triplets from text using an LLM, then add them to the database.
+         Args:
+             text (str): Input text to extract triplets from
+             sample_id (str): Sample ID - used to distinguish graphs produced in different runs/by different users
+             source_text_id (str): Optional; used to distinguish texts from different sources (e.g., different paragraphs of the same text)
+         Returns:
+             tuple: (initial_triplets, final_triplets, filtered_triplets, ontology_filtered_triplets)
+         """
+         (
+             initial_triplets,
+             final_triplets,
+             filtered_triplets,
+             ontology_filtered_triplets,
+         ) = self.extract_triplets_with_ontology_filtering(
+             text, sample_id=sample_id, source_text_id=source_text_id
+         )
+         if len(initial_triplets) > 0:
+             self.aligner.add_initial_triplets(initial_triplets, sample_id=sample_id)
+         if len(final_triplets) > 0:
+             self.aligner.add_triplets(final_triplets, sample_id=sample_id)
+         if len(filtered_triplets) > 0:
+             self.aligner.add_filtered_triplets(filtered_triplets, sample_id=sample_id)
+         if len(ontology_filtered_triplets) > 0:
+             self.aligner.add_ontology_filtered_triplets(
+                 ontology_filtered_triplets, sample_id=sample_id
+             )
+         return (
+             initial_triplets,
+             final_triplets,
+             filtered_triplets,
+             ontology_filtered_triplets,
+         )
wikontic-0.0.3.dist-info/METADATA
@@ -0,0 +1,111 @@
+ Metadata-Version: 2.4
+ Name: wikontic
+ Version: 0.0.3
+ Summary: Extract a knowledge graph from texts with an LLM and perform QA over the resulting KG
+ Author-email: Alla Chepurova <chepurova.data@gmail.com>
+ License-Expression: MIT
+ Project-URL: Homepage, https://github.com/screemix/Wikontic
+ Project-URL: Issues, https://github.com/screemix/Wikontic/issues
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.9
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: streamlit
+ Requires-Dist: numpy
+ Requires-Dist: pyvis
+ Requires-Dist: python-dotenv
+ Requires-Dist: pymongo
+ Requires-Dist: openai
+ Requires-Dist: tenacity
+ Requires-Dist: pathlib
+ Requires-Dist: typing
+ Requires-Dist: unidecode
+ Requires-Dist: torch>=2.4.0
+ Requires-Dist: transformers
+ Requires-Dist: dataclasses
+ Requires-Dist: pydantic
+ Requires-Dist: accelerate
+ Requires-Dist: langchain
+ Dynamic: license-file
+
+ ![Wikontic logo](/media/wikontic.png)
+
+ # Wikontic
+
+ **Build ontology-aware, Wikidata-aligned knowledge graphs from raw text using LLMs**
+
+ ---
+
+ ## 🚀 Overview
+
+ Knowledge Graphs (KGs) provide structured, verifiable representations of knowledge, enabling fact grounding and empowering large language models (LLMs) with up-to-date, real-world information. However, creating high-quality KGs from open-domain text is challenging due to issues like redundancy, inconsistency, and lack of alignment with formal ontologies.
+
+ **Wikontic** is a multi-stage pipeline for constructing ontology-aligned KGs from unstructured text using LLMs and Wikidata. It extracts candidate triples from raw text, then refines them through ontology-based typing, schema validation, and entity deduplication, resulting in compact, semantically coherent graphs.
+
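+ For example, a minimal programmatic run of the structured pipeline looks roughly like this (a sketch: the constructor arguments, the database handle `my_db`, and the input sentence are illustrative; only the module, class, and method names come from the package):
+
+ ```
+ from wikontic.utils.openai_utils import LLMTripletExtractor
+ from wikontic.utils.structured_aligner import Aligner
+ from wikontic.utils.structured_inference_with_db import StructuredInferenceWithDB
+
+ extractor = LLMTripletExtractor(...)  # LLM client for extraction/refinement prompts
+ aligner = Aligner(...)                # loads the Wikidata ontology mappings
+ pipeline = StructuredInferenceWithDB(extractor, aligner, triplets_db=my_db)
+
+ # returns lists of triplet dicts: (initial, final, filtered, ontology_filtered)
+ initial, final, filtered, ontology_filtered = (
+     pipeline.extract_triplets_with_ontology_filtering_and_add_to_db(
+         "Marie Curie was born in Warsaw.", sample_id="demo"
+     )
+ )
+ ```
+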
+ ---
+
+ ## 📁 Repository Structure
+
+ - `preprocessing/constraint-preprocessing.ipynb`
+   Jupyter notebook for collecting constraint rules from Wikidata.
+
+ - `utils/`
+   Utilities for LLM-based triple extraction and alignment with Wikidata ontology rules.
+
+ - `utils/openai_utils.py`
+   `LLMTripletExtractor` class for LLM-based triple extraction.
+
+ ### With the ontology:
+
+ - `utils/ontology_mappings/`
+   JSON files containing ontology mappings from Wikidata.
+
+ - `utils/structured_inference_with_db.py`
+   - `StructuredInferenceWithDB` class: triple extraction and QA functions
+
+ - `utils/structured_aligner.py`
+   - `Aligner` class: ontology alignment and entity name refinement
+
+ ### Without the ontology:
+
+ - `utils/inference_with_db.py`
+   - `InferenceWithDB` class: triple extraction and QA functions
+
+ - `utils/dynamic_aligner.py`
+   - `Aligner` class: entity and relation name refinement
+
+ ### Evaluation:
+
+ - `inference_and_eval`
+   - Scripts for building KGs for the MuSiQue and HotPot datasets and evaluating QA performance
+ - `analysis`
+   - Notebooks with downstream analysis of the resulting KGs
+
+ ### Use Wikontic as a service:
+
+ - `pages/` and `Wikontic.py`
+   Code for the web service for knowledge graph extraction and visualization.
+
+ - `Dockerfile`
+   For building a containerized web service.
+
+ ---
+
+ ## 🏁 Getting Started
+
+ 1. **Set up the ontology and KG databases:**
+    ```
+    ./setup_db.sh
+    ```
+
+ 2. **Launch the web service:**
+    ```
+    streamlit run Wikontic.py
+    ```
+
+ ---
+
+ Enjoy building knowledge graphs with Wikontic!