wikontic 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wikontic/__init__.py +16 -0
- wikontic/create_ontological_triplets_db.py +193 -0
- wikontic/create_triplets_db.py +259 -0
- wikontic/create_wikidata_ontology_db.py +555 -0
- wikontic/utils/__init__.py +7 -0
- wikontic/utils/base_inference_with_db.py +329 -0
- wikontic/utils/dynamic_aligner.py +281 -0
- wikontic/utils/inference_with_db.py +224 -0
- wikontic/utils/ontology_mappings/entity_hierarchy.json +1 -0
- wikontic/utils/ontology_mappings/entity_names.json +1 -0
- wikontic/utils/ontology_mappings/entity_type2aliases.json +1 -0
- wikontic/utils/ontology_mappings/entity_type2hierarchy.json +1 -0
- wikontic/utils/ontology_mappings/entity_type2label.json +1 -0
- wikontic/utils/ontology_mappings/enum_entity_ids.json +1 -0
- wikontic/utils/ontology_mappings/enum_prop_ids.json +1 -0
- wikontic/utils/ontology_mappings/label2entity.json +1 -0
- wikontic/utils/ontology_mappings/obj_constraint2prop.json +1 -0
- wikontic/utils/ontology_mappings/prop2aliases.json +1 -0
- wikontic/utils/ontology_mappings/prop2constraints.json +1 -0
- wikontic/utils/ontology_mappings/prop2data_type.json +1 -0
- wikontic/utils/ontology_mappings/prop2label.json +1 -0
- wikontic/utils/ontology_mappings/propid2enum.json +1 -0
- wikontic/utils/ontology_mappings/subj_constraint2prop.json +1 -0
- wikontic/utils/ontology_mappings/subject_object_constraints.json +1 -0
- wikontic/utils/openai_utils.py +517 -0
- wikontic/utils/prompts/name_refinement/prompt_choose_relation_wo_entity_types.txt +17 -0
- wikontic/utils/prompts/name_refinement/prompt_choose_relation_wo_entity_types_dialog_bench.txt +18 -0
- wikontic/utils/prompts/name_refinement/rank_object_names.txt +17 -0
- wikontic/utils/prompts/name_refinement/rank_object_names_dialog_bench.txt +18 -0
- wikontic/utils/prompts/name_refinement/rank_object_qualifiers.txt +20 -0
- wikontic/utils/prompts/name_refinement/rank_subject_names.txt +18 -0
- wikontic/utils/prompts/name_refinement/rank_subject_names_dialog_bench.txt +20 -0
- wikontic/utils/prompts/ontology_refinement/prompt_choose_entity_types.txt +26 -0
- wikontic/utils/prompts/ontology_refinement/prompt_choose_relation.txt +24 -0
- wikontic/utils/prompts/ontology_refinement/prompt_choose_relation_and_types.txt +28 -0
- wikontic/utils/prompts/qa/prompt_choose_relevant_entities_for_question.txt +17 -0
- wikontic/utils/prompts/qa/prompt_choose_relevant_entities_for_question_wo_types.txt +16 -0
- wikontic/utils/prompts/qa/prompt_entity_extraction_from_question.txt +3 -0
- wikontic/utils/prompts/qa/prompt_is_answered.txt +43 -0
- wikontic/utils/prompts/qa/qa_collapsing_prompt.txt +22 -0
- wikontic/utils/prompts/qa/qa_prompt.txt +5 -0
- wikontic/utils/prompts/qa/qa_prompt_hotpot.txt +6 -0
- wikontic/utils/prompts/qa/question_decomposition_1.txt +7 -0
- wikontic/utils/prompts/triplet_extraction/prompt_1_types_qualifiers_dialog_bench.txt +75 -0
- wikontic/utils/prompts/triplet_extraction/prompt_1_types_qualifiers_dialog_bench_in_russian.txt +78 -0
- wikontic/utils/prompts/triplet_extraction/propmt_1_types_qualifiers.txt +91 -0
- wikontic/utils/structured_aligner.py +606 -0
- wikontic/utils/structured_inference_with_db.py +561 -0
- wikontic-0.0.3.dist-info/METADATA +111 -0
- wikontic-0.0.3.dist-info/RECORD +53 -0
- wikontic-0.0.3.dist-info/WHEEL +5 -0
- wikontic-0.0.3.dist-info/licenses/LICENSE +19 -0
- wikontic-0.0.3.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import warnings
|
|
3
|
+
from langchain.tools import tool
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
from .base_inference_with_db import BaseInferenceWithDB
|
|
7
|
+
|
|
8
|
+
# NOTE(review): with no category or module given, this filter ignores every
# warning for the whole process as soon as this module is imported - confirm
# that this is intended for a library module.
warnings.filterwarnings("ignore")
# Logger used by this module. Its level is ERROR, so the DEBUG and INFO
# records emitted through it are dropped unless the level is changed.
logger = logging.getLogger("InferenceWithDB")
logger.setLevel(logging.ERROR)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class InferenceWithDB(BaseInferenceWithDB):
|
|
14
|
+
def __init__(self, extractor, aligner, triplets_db):
    """
    Store the collaborators and wrap selected methods as LangChain tools.

    Args:
        extractor: Object whose methods perform the LLM-based extraction and
            refinement steps used by this class
        aligner: Object used to look up similar names and to store entities,
            relations and triplets
        triplets_db: Triplets database object; it is only stored here
    """
    # NOTE(review): the base-class __init__ is not called here - confirm that
    # BaseInferenceWithDB needs no initialization of its own.
    self.extractor, self.aligner, self.triplets_db = extractor, aligner, triplets_db

    # (attribute that receives the tool, name of the method being wrapped).
    # NOTE(review): the last three methods are not defined in this class -
    # presumably they are inherited from BaseInferenceWithDB; confirm.
    tool_sources = (
        ("extract_triplets_tool", "extract_triplets"),
        (
            "extract_triplets_and_add_to_db_tool",
            "extract_triplets_and_add_to_db",
        ),
        (
            "retrieve_similar_entity_names_tool",
            "retrieve_similar_entity_names",
        ),
        (
            "identify_relevant_entities_from_question_tool",
            "identify_relevant_entities_from_question_with_llm",
        ),
        (
            "get_1_hop_supporting_triplets_tool",
            "get_1_hop_supporting_triplets",
        ),
    )
    for tool_attr, method_name in tool_sources:
        setattr(self, tool_attr, tool(getattr(self, method_name)))
|
|
32
|
+
|
|
33
|
+
def sanitize_string(self, s):
    """
    Normalize a raw value into a clean string.

    The value is converted with ``str()``, surrounding whitespace is stripped
    and every backslash-plus-double-quote pair is removed. If the result
    starts with a literal backslash followed by ``u``, it is treated as
    escaped text and its escape sequences are decoded.

    Args:
        s: Any value; it is converted with ``str()`` first, so ``None``
            becomes the string "None"
    Returns:
        str: The cleaned string. Text with a malformed escape sequence is
        returned undecoded instead of raising.
    """
    s = str(s).strip().replace('\\"', "")
    if s.startswith(r"\u"):
        try:
            # Encode with "backslashreplace" so that characters outside ASCII
            # become escapes too; after a plain UTF-8 encode, "unicode_escape"
            # would read their bytes as Latin-1 and garble them.
            s = s.encode("ascii", "backslashreplace").decode("unicode_escape")
        except UnicodeDecodeError:
            # Malformed or truncated escape: keep the text as it is
            pass
    return s.strip()
|
|
38
|
+
|
|
39
|
+
def extract_triplets(self, text, sample_id, source_text_id=None):
    """
    Extract and refine knowledge graph triplets from text using LLM.

    Args:
        text (str): Input text to extract triplets from
        sample_id (str): Sample ID - used to distinguish graphs resulted
            in different launches/by different users
        source_text_id (str): Optional, - used to distinguish texts from
            different sources (for example, different paragraphs of the same text)
    Returns:
        tuple:
            (initial_triplets, final_triplets, filtered_triplets) - copies of
            the triplets as extracted, the triplets whose names were refined
            successfully, and the triplets whose refinement raised an
            exception (these carry an "exception_text" entry)
    """
    self.extractor.reset_tokens()
    self.extractor.reset_messages()
    self.extractor.reset_error_state()

    separator = "-" * 100
    extracted_triplets = self.extractor.extract_triplets_from_text(text)
    triplets = extracted_triplets["triplets"]

    # Tag every triplet with the token counts reported by the extractor and
    # with its ids, and keep a copy taken before any name is refined
    initial_triplets = []
    for triplet in triplets:
        triplet["prompt_token_num"], triplet["completion_token_num"] = (
            self.extractor.calculate_used_tokens()
        )
        triplet["source_text_id"] = source_text_id
        triplet["sample_id"] = sample_id
        initial_triplets.append(triplet.copy())

    final_triplets = []
    filtered_triplets = []

    for triplet in triplets:
        self.extractor.reset_tokens()
        try:
            logger.debug("Triplet: %s\n%s", triplet, separator)
            # Both names are refined against the triplet as extracted and
            # only then written back
            refined_subject = self.refine_entity_name(
                text, triplet, sample_id, is_object=False
            )
            refined_object = self.refine_entity_name(
                text, triplet, sample_id, is_object=True
            )
            triplet["subject"] = refined_subject
            triplet["object"] = refined_object

            # The relation is refined with the refined names already in place
            refined_relation = self.refine_relation_name(text, triplet, sample_id)
            triplet["relation"] = refined_relation

            final_triplets.append(triplet)
            logger.debug("Final triplet: %s\n%s", triplet, separator)
            logger.debug("Refined subject: %s\n%s", refined_subject, separator)
            logger.debug("Refined object: %s\n%s", refined_object, separator)
            logger.debug("Refined relation: %s\n%s", refined_relation, separator)

        except Exception as e:
            # Best effort: a triplet that fails to refine is reported in
            # filtered_triplets instead of aborting the whole text
            triplet["exception_text"] = str(e)
            triplet["prompt_token_num"], triplet["completion_token_num"] = (
                self.extractor.calculate_used_tokens()
            )
            triplet["sample_id"] = sample_id
            filtered_triplets.append(triplet)
            logger.info("Filtered triplet: %s\n%s", triplet, separator)
            logger.info("Exception: %s", e)

    return initial_triplets, final_triplets, filtered_triplets
|
|
118
|
+
|
|
119
|
+
def extract_triplets_and_add_to_db(self, text, source_text_id, sample_id=None):
    """
    Extract and refine knowledge graph triplets from text using LLM, then add them to the database.

    Args:
        text (str): Input text to extract triplets from
        source_text_id (str): Used to distinguish texts from different sources
            (for example, different paragraphs of the same text)
        sample_id (str): Optional - used to distinguish graphs resulted in
            different launches/by different users
    Returns:
        tuple: (initial_triplets, final_triplets, filtered_triplets)
    """
    initial_triplets, final_triplets, filtered_triplets = self.extract_triplets(
        text, sample_id, source_text_id
    )

    # Each group is stored through its own aligner call; empty groups are skipped
    if initial_triplets:
        self.aligner.add_initial_triplets(initial_triplets, sample_id=sample_id)
    if final_triplets:
        self.aligner.add_triplets(final_triplets, sample_id=sample_id)
    if filtered_triplets:
        self.aligner.add_filtered_triplets(filtered_triplets, sample_id=sample_id)

    return initial_triplets, final_triplets, filtered_triplets
|
|
145
|
+
|
|
146
|
+
def refine_entity_name(self, text, triplet, sample_id, is_object=False):
    """
    Align the subject or object name of a triplet with names known to the aligner.

    The name is sanitized and compared with the similar names returned by the
    aligner. With no candidates, or when the name itself is among them, it is
    kept; otherwise the extractor (LLM) is asked to choose among the
    candidates. An empty or "None" answer falls back to the original name.
    The resulting name is registered with the aligner, with the original name
    as its alias.

    Args:
        text (str): Source text the triplet was extracted from
        triplet: Triplet mapping; its "subject" or "object" entry is read
        sample_id (str): Sample ID passed through to the aligner
        is_object (bool): Refine the "object" when True, the "subject" otherwise
    Returns:
        str: The refined entity name
    """
    self.extractor.reset_error_state()
    entity = self.sanitize_string(triplet["object" if is_object else "subject"])

    similar_entities = [
        self.sanitize_string(candidate)
        for candidate in self.aligner.retrieve_similar_entity_names(
            entity_name=entity, sample_id=sample_id
        )
    ]

    if not similar_entities or entity in similar_entities:
        # Nothing to choose from, or an exact match: keep the name
        updated_entity = entity
    else:
        # No exact match: let the LLM pick among the candidates
        updated_entity = self.extractor.refine_entity(
            text=text,
            triplet=triplet,
            candidates=similar_entities,
            is_object=is_object,
        )

    updated_entity = self.sanitize_string(updated_entity)
    # Punctuation is removed before the comparison so that answers such as
    # "None." or "'None'" also count as "no suitable candidate". An empty
    # answer is handled the same way, so that an empty string is never
    # registered as an entity name.
    if not updated_entity or re.sub(r"[^\w\s]", "", updated_entity) == "None":
        updated_entity = entity

    self.aligner.add_entity(
        entity_name=updated_entity, alias=entity, sample_id=sample_id
    )

    return updated_entity
|
|
190
|
+
|
|
191
|
+
def refine_relation_name(self, text, triplet, sample_id):
    """
    Align the relation name of a triplet with relation names known to the aligner.

    The relation is sanitized and compared with the similar relations returned
    by the aligner. With no candidates, or when the relation itself is among
    them, it is kept; otherwise the extractor (LLM) is asked to choose among
    the candidates. An empty or "None" answer falls back to the original
    relation. The resulting name is registered with the aligner, with the
    original relation as its alias.

    Args:
        text (str): Source text the triplet was extracted from
        triplet: Triplet mapping; its "relation" entry is read
        sample_id (str): Sample ID passed through to the aligner
    Returns:
        str: The refined relation name
    """
    self.extractor.reset_error_state()

    relation = self.sanitize_string(triplet["relation"])

    similar_relations = [
        self.sanitize_string(candidate)
        for candidate in self.aligner.retrieve_similar_properties(
            target_relation=relation, sample_id=sample_id
        )
    ]

    if not similar_relations or relation in similar_relations:
        # Nothing to choose from, or an exact match: keep the relation
        updated_relation = relation
    else:
        updated_relation = self.extractor.refine_relation_wo_entity_types(
            text=text, triplet=triplet, candidate_relations=similar_relations
        )

    updated_relation = self.sanitize_string(updated_relation)
    # Punctuation is removed before the comparison so that answers such as
    # "None." also count as "no suitable candidate". An empty answer is
    # handled the same way, so that an empty string is never registered as a
    # relation name.
    if not updated_relation or re.sub(r"[^\w\s]", "", updated_relation) == "None":
        updated_relation = relation

    self.aligner.add_property(
        property_name=updated_relation, alias=relation, sample_id=sample_id
    )

    return updated_relation
|