wikontic-0.0.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. wikontic/__init__.py +16 -0
  2. wikontic/create_ontological_triplets_db.py +193 -0
  3. wikontic/create_triplets_db.py +259 -0
  4. wikontic/create_wikidata_ontology_db.py +555 -0
  5. wikontic/utils/__init__.py +7 -0
  6. wikontic/utils/base_inference_with_db.py +329 -0
  7. wikontic/utils/dynamic_aligner.py +281 -0
  8. wikontic/utils/inference_with_db.py +224 -0
  9. wikontic/utils/ontology_mappings/entity_hierarchy.json +1 -0
  10. wikontic/utils/ontology_mappings/entity_names.json +1 -0
  11. wikontic/utils/ontology_mappings/entity_type2aliases.json +1 -0
  12. wikontic/utils/ontology_mappings/entity_type2hierarchy.json +1 -0
  13. wikontic/utils/ontology_mappings/entity_type2label.json +1 -0
  14. wikontic/utils/ontology_mappings/enum_entity_ids.json +1 -0
  15. wikontic/utils/ontology_mappings/enum_prop_ids.json +1 -0
  16. wikontic/utils/ontology_mappings/label2entity.json +1 -0
  17. wikontic/utils/ontology_mappings/obj_constraint2prop.json +1 -0
  18. wikontic/utils/ontology_mappings/prop2aliases.json +1 -0
  19. wikontic/utils/ontology_mappings/prop2constraints.json +1 -0
  20. wikontic/utils/ontology_mappings/prop2data_type.json +1 -0
  21. wikontic/utils/ontology_mappings/prop2label.json +1 -0
  22. wikontic/utils/ontology_mappings/propid2enum.json +1 -0
  23. wikontic/utils/ontology_mappings/subj_constraint2prop.json +1 -0
  24. wikontic/utils/ontology_mappings/subject_object_constraints.json +1 -0
  25. wikontic/utils/openai_utils.py +517 -0
  26. wikontic/utils/prompts/name_refinement/prompt_choose_relation_wo_entity_types.txt +17 -0
  27. wikontic/utils/prompts/name_refinement/prompt_choose_relation_wo_entity_types_dialog_bench.txt +18 -0
  28. wikontic/utils/prompts/name_refinement/rank_object_names.txt +17 -0
  29. wikontic/utils/prompts/name_refinement/rank_object_names_dialog_bench.txt +18 -0
  30. wikontic/utils/prompts/name_refinement/rank_object_qualifiers.txt +20 -0
  31. wikontic/utils/prompts/name_refinement/rank_subject_names.txt +18 -0
  32. wikontic/utils/prompts/name_refinement/rank_subject_names_dialog_bench.txt +20 -0
  33. wikontic/utils/prompts/ontology_refinement/prompt_choose_entity_types.txt +26 -0
  34. wikontic/utils/prompts/ontology_refinement/prompt_choose_relation.txt +24 -0
  35. wikontic/utils/prompts/ontology_refinement/prompt_choose_relation_and_types.txt +28 -0
  36. wikontic/utils/prompts/qa/prompt_choose_relevant_entities_for_question.txt +17 -0
  37. wikontic/utils/prompts/qa/prompt_choose_relevant_entities_for_question_wo_types.txt +16 -0
  38. wikontic/utils/prompts/qa/prompt_entity_extraction_from_question.txt +3 -0
  39. wikontic/utils/prompts/qa/prompt_is_answered.txt +43 -0
  40. wikontic/utils/prompts/qa/qa_collapsing_prompt.txt +22 -0
  41. wikontic/utils/prompts/qa/qa_prompt.txt +5 -0
  42. wikontic/utils/prompts/qa/qa_prompt_hotpot.txt +6 -0
  43. wikontic/utils/prompts/qa/question_decomposition_1.txt +7 -0
  44. wikontic/utils/prompts/triplet_extraction/prompt_1_types_qualifiers_dialog_bench.txt +75 -0
  45. wikontic/utils/prompts/triplet_extraction/prompt_1_types_qualifiers_dialog_bench_in_russian.txt +78 -0
  46. wikontic/utils/prompts/triplet_extraction/propmt_1_types_qualifiers.txt +91 -0
  47. wikontic/utils/structured_aligner.py +606 -0
  48. wikontic/utils/structured_inference_with_db.py +561 -0
  49. wikontic-0.0.3.dist-info/METADATA +111 -0
  50. wikontic-0.0.3.dist-info/RECORD +53 -0
  51. wikontic-0.0.3.dist-info/WHEEL +5 -0
  52. wikontic-0.0.3.dist-info/licenses/LICENSE +19 -0
  53. wikontic-0.0.3.dist-info/top_level.txt +1 -0
wikontic/utils/inference_with_db.py
@@ -0,0 +1,224 @@
+ import re
+ import warnings
+ from langchain.tools import tool
+ import logging
+ from typing import List
+
+ from .base_inference_with_db import BaseInferenceWithDB
+
+ warnings.filterwarnings("ignore")
+ logger = logging.getLogger("InferenceWithDB")
+ logger.setLevel(logging.ERROR)
+
+
+ class InferenceWithDB(BaseInferenceWithDB):
+     def __init__(self, extractor, aligner, triplets_db):
+         self.extractor = extractor
+         self.aligner = aligner
+         self.triplets_db = triplets_db
+
+         # Expose the main entry points as LangChain tools.
+         self.extract_triplets_tool = tool(self.extract_triplets)
+         self.extract_triplets_and_add_to_db_tool = tool(
+             self.extract_triplets_and_add_to_db
+         )
+         self.retrieve_similar_entity_names_tool = tool(
+             self.retrieve_similar_entity_names
+         )
+         self.identify_relevant_entities_from_question_tool = tool(
+             self.identify_relevant_entities_from_question_with_llm
+         )
+         self.get_1_hop_supporting_triplets_tool = tool(
+             self.get_1_hop_supporting_triplets
+         )
+
+     def sanitize_string(self, s):
+         # Remove escaped double quotes and decode strings that arrive as
+         # literal "\uXXXX" escape sequences.
+         s = str(s).strip().replace('\\"', "")
+         if s.startswith(r"\u"):
+             s = s.encode().decode("unicode_escape")
+         return s.strip()
+
+     def extract_triplets(self, text, sample_id, source_text_id=None):
+         """
+         Extract and refine knowledge graph triplets from text using an LLM.
+
+         Args:
+             text (str): Input text to extract triplets from.
+             sample_id (str): Sample ID, used to distinguish graphs produced
+                 by different runs or by different users.
+             source_text_id (str): Optional; used to distinguish texts from
+                 different sources (for example, different paragraphs of the
+                 same text).
+
+         Returns:
+             tuple: (initial_triplets, final_triplets, filtered_triplets)
+         """
+         self.extractor.reset_tokens()
+         self.extractor.reset_messages()
+         self.extractor.reset_error_state()
+
+         initial_triplets = []
+
+         # First pass: record every raw triplet with token usage and provenance.
+         extracted_triplets = self.extractor.extract_triplets_from_text(text)
+         for triplet in extracted_triplets["triplets"]:
+             triplet["prompt_token_num"], triplet["completion_token_num"] = (
+                 self.extractor.calculate_used_tokens()
+             )
+             triplet["source_text_id"] = source_text_id
+             triplet["sample_id"] = sample_id
+             initial_triplets.append(triplet.copy())
+
+         final_triplets = []
+         filtered_triplets = []
+
+         # Second pass: refine subject, object, and relation names; triplets
+         # that raise during refinement are filtered out.
+         for triplet in extracted_triplets["triplets"]:
+             self.extractor.reset_tokens()
+             try:
+                 logger.log(logging.DEBUG, "Triplet: %s\n%s" % (str(triplet), "-" * 100))
+                 refined_subject = self.refine_entity_name(
+                     text, triplet, sample_id, is_object=False
+                 )
+                 refined_object = self.refine_entity_name(
+                     text, triplet, sample_id, is_object=True
+                 )
+
+                 triplet["subject"] = refined_subject
+                 triplet["object"] = refined_object
+
+                 refined_relation = self.refine_relation_name(text, triplet, sample_id)
+                 triplet["relation"] = refined_relation
+
+                 final_triplets.append(triplet)
+                 logger.log(
+                     logging.DEBUG, "Final triplet: %s\n%s" % (str(triplet), "-" * 100)
+                 )
+                 logger.log(
+                     logging.DEBUG,
+                     "Refined subject: %s\n%s" % (str(refined_subject), "-" * 100),
+                 )
+                 logger.log(
+                     logging.DEBUG,
+                     "Refined object: %s\n%s" % (str(refined_object), "-" * 100),
+                 )
+                 logger.log(
+                     logging.DEBUG,
+                     "Refined relation: %s\n%s" % (str(refined_relation), "-" * 100),
+                 )
+
+             except Exception as e:
+                 triplet["exception_text"] = str(e)
+                 triplet["prompt_token_num"], triplet["completion_token_num"] = (
+                     self.extractor.calculate_used_tokens()
+                 )
+                 triplet["sample_id"] = sample_id
+                 filtered_triplets.append(triplet)
+                 logger.log(
+                     logging.INFO, "Filtered triplet: %s\n%s" % (str(triplet), "-" * 100)
+                 )
+                 logger.log(logging.INFO, "Exception: %s" % (str(e)))
+
+         return initial_triplets, final_triplets, filtered_triplets
+
+     def extract_triplets_and_add_to_db(self, text, source_text_id, sample_id=None):
+         """
+         Extract and refine knowledge graph triplets from text using an LLM,
+         then add them to the database.
+
+         Args:
+             text (str): Input text to extract triplets from.
+             source_text_id (str): Used to distinguish texts from different
+                 sources (for example, different paragraphs of the same text).
+             sample_id (str): Optional; used to distinguish graphs produced
+                 by different runs or by different users.
+
+         Returns:
+             tuple: (initial_triplets, final_triplets, filtered_triplets)
+         """
+         (
+             initial_triplets,
+             final_triplets,
+             filtered_triplets,
+         ) = self.extract_triplets(text, sample_id, source_text_id)
+         if len(initial_triplets) > 0:
+             self.aligner.add_initial_triplets(initial_triplets, sample_id=sample_id)
+         if len(final_triplets) > 0:
+             self.aligner.add_triplets(final_triplets, sample_id=sample_id)
+         if len(filtered_triplets) > 0:
+             self.aligner.add_filtered_triplets(filtered_triplets, sample_id=sample_id)
+         return (
+             initial_triplets,
+             final_triplets,
+             filtered_triplets,
+         )
+
+     def refine_entity_name(self, text, triplet, sample_id, is_object=False):
+         """
+         Refine an entity name (subject or object) by aligning it with
+         similar entity names already stored in the database.
+         """
+         self.extractor.reset_error_state()
+         if is_object:
+             entity = triplet["object"]
+         else:
+             entity = triplet["subject"]
+         # entity = unidecode(entity)
+         entity = self.sanitize_string(entity)
+
+         similar_entities = self.aligner.retrieve_similar_entity_names(
+             entity_name=entity, sample_id=sample_id
+         )
+
+         similar_entities = [self.sanitize_string(entity) for entity in similar_entities]
+
+         # if there are similar entities -> refine the entity name
+         # if there are no similar entities -> return the original entity
+         # if an exact match is found -> return the exact match
+         if len(similar_entities) == 0 or entity in similar_entities:
+             updated_entity = entity
+         else:
+             # no exact match -> refine the entity name
+             updated_entity = self.extractor.refine_entity(
+                 text=text,
+                 triplet=triplet,
+                 candidates=similar_entities,
+                 is_object=is_object,
+             )
+             # unidecode the updated entity
+             # updated_entity = unidecode(updated_entity)
+             updated_entity = self.sanitize_string(updated_entity)
+             # if the updated entity is "None" (the LLM found no similar entity)
+             # -> return the original entity
+             if re.sub(r"[^\w\s]", "", updated_entity) == "None":
+                 updated_entity = entity
+
+         self.aligner.add_entity(
+             entity_name=updated_entity, alias=entity, sample_id=sample_id
+         )
+
+         return updated_entity
+
+     def refine_relation_name(self, text, triplet, sample_id):
+         """
+         Refine a relation name using the LLM.
+         """
+         self.extractor.reset_error_state()
+
+         # relation = unidecode(triplet['relation'])
+         relation = self.sanitize_string(triplet["relation"])
+
+         similar_relations: List[str] = self.aligner.retrieve_similar_properties(
+             target_relation=relation, sample_id=sample_id
+         )
+
+         similar_relations = [
+             self.sanitize_string(relation) for relation in similar_relations
+         ]
+         if len(similar_relations) == 0 or relation in similar_relations:
+             updated_relation = relation
+         else:
+             updated_relation = self.extractor.refine_relation_wo_entity_types(
+                 text=text, triplet=triplet, candidate_relations=similar_relations
+             )
+
+         # updated_relation = unidecode(updated_relation)
+         updated_relation = self.sanitize_string(updated_relation)
+
+         if re.sub(r"[^\w\s]", "", updated_relation) == "None":
+             updated_relation = relation
+
+         self.aligner.add_property(
+             property_name=updated_relation, alias=relation, sample_id=sample_id
+         )
+
+         return updated_relation
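
For orientation, below is a minimal, self-contained sketch (not part of the wheel) of the alignment pattern that both refine_entity_name and refine_relation_name implement: keep a name when the candidate store is empty or already contains it, otherwise ask a refiner to pick a canonical candidate, and treat a "None" answer (modulo punctuation) as "no match, keep the original". The align_name helper and the toy refiner are hypothetical stand-ins for the extractor's LLM calls.

import re

def align_name(name, candidates, refine):
    """Sketch of the keep / exact-match / refine-or-fall-back flow above."""
    name = name.strip()
    if not candidates or name in candidates:
        return name  # nothing to align against, or already canonical
    updated = refine(name, candidates).strip()
    # the refiner answers "None" (possibly with stray punctuation) when nothing fits
    if re.sub(r"[^\w\s]", "", updated) == "None":
        return name
    return updated

# toy refiner: case-insensitive exact match, else "None"
def toy_refine(name, candidates):
    return next((c for c in candidates if c.lower() == name.lower()), "None")

print(align_name("warsaw", ["Warsaw", "Krakow"], toy_refine))  # -> Warsaw
print(align_name("Nobel Prize", ["Warsaw"], toy_refine))       # -> Nobel Prize
print(align_name("Warsaw", [], toy_refine))                    # -> Warsaw

Treating "None" as a sentinel rather than raising keeps a single failed alignment from discarding a triplet; the real methods additionally record the chosen name and its alias in the database via aligner.add_entity / aligner.add_property.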