wikontic-0.0.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wikontic/__init__.py +16 -0
- wikontic/create_ontological_triplets_db.py +193 -0
- wikontic/create_triplets_db.py +259 -0
- wikontic/create_wikidata_ontology_db.py +555 -0
- wikontic/utils/__init__.py +7 -0
- wikontic/utils/base_inference_with_db.py +329 -0
- wikontic/utils/dynamic_aligner.py +281 -0
- wikontic/utils/inference_with_db.py +224 -0
- wikontic/utils/ontology_mappings/entity_hierarchy.json +1 -0
- wikontic/utils/ontology_mappings/entity_names.json +1 -0
- wikontic/utils/ontology_mappings/entity_type2aliases.json +1 -0
- wikontic/utils/ontology_mappings/entity_type2hierarchy.json +1 -0
- wikontic/utils/ontology_mappings/entity_type2label.json +1 -0
- wikontic/utils/ontology_mappings/enum_entity_ids.json +1 -0
- wikontic/utils/ontology_mappings/enum_prop_ids.json +1 -0
- wikontic/utils/ontology_mappings/label2entity.json +1 -0
- wikontic/utils/ontology_mappings/obj_constraint2prop.json +1 -0
- wikontic/utils/ontology_mappings/prop2aliases.json +1 -0
- wikontic/utils/ontology_mappings/prop2constraints.json +1 -0
- wikontic/utils/ontology_mappings/prop2data_type.json +1 -0
- wikontic/utils/ontology_mappings/prop2label.json +1 -0
- wikontic/utils/ontology_mappings/propid2enum.json +1 -0
- wikontic/utils/ontology_mappings/subj_constraint2prop.json +1 -0
- wikontic/utils/ontology_mappings/subject_object_constraints.json +1 -0
- wikontic/utils/openai_utils.py +517 -0
- wikontic/utils/prompts/name_refinement/prompt_choose_relation_wo_entity_types.txt +17 -0
- wikontic/utils/prompts/name_refinement/prompt_choose_relation_wo_entity_types_dialog_bench.txt +18 -0
- wikontic/utils/prompts/name_refinement/rank_object_names.txt +17 -0
- wikontic/utils/prompts/name_refinement/rank_object_names_dialog_bench.txt +18 -0
- wikontic/utils/prompts/name_refinement/rank_object_qualifiers.txt +20 -0
- wikontic/utils/prompts/name_refinement/rank_subject_names.txt +18 -0
- wikontic/utils/prompts/name_refinement/rank_subject_names_dialog_bench.txt +20 -0
- wikontic/utils/prompts/ontology_refinement/prompt_choose_entity_types.txt +26 -0
- wikontic/utils/prompts/ontology_refinement/prompt_choose_relation.txt +24 -0
- wikontic/utils/prompts/ontology_refinement/prompt_choose_relation_and_types.txt +28 -0
- wikontic/utils/prompts/qa/prompt_choose_relevant_entities_for_question.txt +17 -0
- wikontic/utils/prompts/qa/prompt_choose_relevant_entities_for_question_wo_types.txt +16 -0
- wikontic/utils/prompts/qa/prompt_entity_extraction_from_question.txt +3 -0
- wikontic/utils/prompts/qa/prompt_is_answered.txt +43 -0
- wikontic/utils/prompts/qa/qa_collapsing_prompt.txt +22 -0
- wikontic/utils/prompts/qa/qa_prompt.txt +5 -0
- wikontic/utils/prompts/qa/qa_prompt_hotpot.txt +6 -0
- wikontic/utils/prompts/qa/question_decomposition_1.txt +7 -0
- wikontic/utils/prompts/triplet_extraction/prompt_1_types_qualifiers_dialog_bench.txt +75 -0
- wikontic/utils/prompts/triplet_extraction/prompt_1_types_qualifiers_dialog_bench_in_russian.txt +78 -0
- wikontic/utils/prompts/triplet_extraction/propmt_1_types_qualifiers.txt +91 -0
- wikontic/utils/structured_aligner.py +606 -0
- wikontic/utils/structured_inference_with_db.py +561 -0
- wikontic-0.0.3.dist-info/METADATA +111 -0
- wikontic-0.0.3.dist-info/RECORD +53 -0
- wikontic-0.0.3.dist-info/WHEEL +5 -0
- wikontic-0.0.3.dist-info/licenses/LICENSE +19 -0
- wikontic-0.0.3.dist-info/top_level.txt +1 -0
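Based on the listing above, the wheel ships a top-level wikontic package with a utils subpackage, Wikidata ontology mappings as JSON files, and prompt templates as plain-text files. A minimal import sketch, assuming the wheel is installed as-is and the modules are importable under the paths shown in the listing:

# Hypothetical import sketch; the module path is taken from the file listing above.
from wikontic.utils.openai_utils import LLMTripletExtractor  # the LLM wrapper shown in the diff below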
wikontic/utils/openai_utils.py
ADDED

@@ -0,0 +1,517 @@
import openai
import os
from dotenv import load_dotenv, find_dotenv
from tenacity import (
    retry,
    wait_random_exponential,
    before_sleep_log,
    stop_after_attempt,
)
import logging
import sys
import json
import re
from pathlib import Path
from typing import Dict, List, Union, Optional
import tenacity
import httpx

# Configure logging
logging.basicConfig(stream=sys.stderr, level=logging.WARNING)
logger = logging.getLogger("OpenAIUtils")
logger.setLevel(logging.ERROR)
logging.getLogger("httpx").setLevel(logging.WARNING)

_ = load_dotenv(find_dotenv())
# OpenAI
client = openai.OpenAI(api_key=os.getenv("KEY"))

MAX_ATTEMPTS = 1


class LLMTripletExtractor:
    """A class for extracting and processing knowledge graph triplets using OpenAI's LLMs."""

    MODEL_PRICES = {
        "gpt-4o": {"input": 2.5, "output": 10},
        "gpt-4o-mini": {"input": 0.15, "output": 0.6},
        "gpt-4.1-mini": {"input": 0.4, "output": 1.6},
        "gpt-4.1": {"input": 2.0, "output": 8.0},
        "Meta-llama/Llama-3.3-70B-Instruct": {"input": 0.04, "output": 0.12},
        "qwen/qwen3-32b": {"input": 0.05, "output": 0.2},
    }

    def __init__(
        self,
        prompt_folder_path: str = str(Path(__file__).parent / "prompts"),
        system_prompt_paths: Optional[Dict[str, str]] = None,
        model: str = "gpt-4o",
        max_attempts=MAX_ATTEMPTS,
    ):
        """
        Initialize the LLMTripletExtractor.

        Args:
            prompt_folder_path: Path to folder containing prompt files
            system_prompt_paths: Dictionary mapping prompt types to file paths
            model: Name of the OpenAI model to use
        """
        if system_prompt_paths is None:
            system_prompt_paths = {
                "triplet_extraction": "triplet_extraction/propmt_1_types_qualifiers.txt",
                # 'triplet_extraction': 'triplet_extraction/prompt_1_types_qualifiers_dialog_bench.txt',
                "relation_entity_types_ranker": "ontology_refinement/prompt_choose_relation_and_types.txt",
                "relation_ranker": "ontology_refinement/prompt_choose_relation.txt",
                "entity_types_ranker": "ontology_refinement/prompt_choose_entity_types.txt",
                "relation_ranker_wo_entity_types": "name_refinement/prompt_choose_relation_wo_entity_types.txt",
                # 'relation_ranker_wo_entity_types': 'name_refinement/prompt_choose_relation_wo_entity_types_dialog_bench.txt',
                # 'subject_ranker': 'name_refinement/rank_subject_names_dialog_bench.txt',
                "subject_ranker": "name_refinement/rank_subject_names.txt",
                # 'object_ranker': 'name_refinement/rank_object_names_dialog_bench.txt',
                "object_ranker": "name_refinement/rank_object_names.txt",
                "quailfier_object_ranker": "name_refinement/rank_object_qualifiers.txt",
                "question_entity_extractor": "qa/prompt_entity_extraction_from_question.txt",
                "question_entity_ranker": "qa/prompt_choose_relevant_entities_for_question.txt",
                "question_entity_ranker_wo_types": "qa/prompt_choose_relevant_entities_for_question_wo_types.txt",
                # 'qa': 'qa_prompt_hotpot.txt'
                "question_decomposition_1": "qa/question_decomposition_1.txt",
                "qa_collapsing": "qa/qa_collapsing_prompt.txt",
                "qa_is_answered": "qa/prompt_is_answered.txt",
                "qa": "qa/qa_prompt.txt",
            }

        # Load all prompts
        prompt_folder = Path(prompt_folder_path)
        self.prompts = {}
        for prompt_type, filename in system_prompt_paths.items():
            with open(prompt_folder / filename) as f:
                self.prompts[prompt_type] = f.read()

        self.model = model
        self.messages = []
        self.prompt_tokens_num = 0
        self.completion_tokens_num = 0
        self.current_cost = 0

        self._refine_attempt = 0
        self._prev_error = None  # store previous exception
        self.MAX_ATTEMPTS = max_attempts

        # Set pricing
        if model not in self.MODEL_PRICES:
            raise ValueError(f"Unknown model: {model}")
        self.input_price = self.MODEL_PRICES[model]["input"]
        self.output_price = self.MODEL_PRICES[model]["output"]

    def extract_json(self, text: str) -> Union[dict, list, str]:
        """Extract JSON from text, handling both code blocks and inline JSON."""
        patterns = [
            r"```json\s*(\{.*?\}|\[.*?\])\s*```",  # JSON in code blocks
            r"(\{.*?\}|\[.*?\])",  # Inline JSON
        ]

        try:
            return json.loads(text)
        except json.JSONDecodeError:
            pass

        for pattern in patterns:
            match = re.search(pattern, text, re.DOTALL)
            if match:
                try:
                    return json.loads(match.group(1))
                except json.JSONDecodeError:
                    logger.error(f"Failed to parse JSON: {text}")

        return text

    @retry(
        wait=wait_random_exponential(multiplier=1, max=60),
        before_sleep=before_sleep_log(logger, logging.ERROR),
        stop=stop_after_attempt(5),
    )
    def get_completion(
        self, system_prompt: str, user_prompt: str, transform_to_json: bool = True
    ) -> Union[dict, list, str]:
        """Get completion from OpenAI API with retry logic."""
        if self.model == "qwen/qwen3-32b":
            user_prompt = "/no_think \n" + user_prompt
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        response = client.chat.completions.create(
            model=self.model, messages=messages, temperature=0
        )
        self.completion_tokens_num += response.usage.completion_tokens
        self.prompt_tokens_num += response.usage.prompt_tokens
        self.current_cost += (
            response.usage.completion_tokens * self.output_price
            + response.usage.prompt_tokens * self.input_price
        )

        content = response.choices[0].message.content.strip()
        logger.debug("Output content: %s\n%s", str(content), "-" * 100)
        output = self.extract_json(content) if transform_to_json else content

        self.messages = messages + [{"role": "assistant", "content": output}]
        return output

    @tenacity.retry(stop=tenacity.stop_after_attempt(MAX_ATTEMPTS), reraise=True)
    def extract_triplets_from_text(self, text: str) -> dict:
        """Extract knowledge graph triplets from text."""

        self._refine_attempt += 1
        attempt = self._refine_attempt
        logger.log(
            logging.DEBUG,
            "Attempt of a function call extract_triplets_from_text: %s",
            attempt,
        )
        system_prompt = self.prompts["triplet_extraction"]
        if attempt > 1:
            prev_error = self._prev_error
            system_prompt += f"\n(Previous attempt #{attempt-1} failed with error: {prev_error}. Please adjust your answer!)"
            logger.log(logging.ERROR, "System prompt: %s", system_prompt)

        try:
            return self.get_completion(
                system_prompt=system_prompt, user_prompt=f'Text: "{text}"'
            )
        except Exception as e:
            self._prev_error = e
            # if json from output is broken after 3 attempts - raise an exception
            logger.log(logging.ERROR, str(e))
            if attempt > self.MAX_ATTEMPTS:
                raise e

    @tenacity.retry(stop=tenacity.stop_after_attempt(MAX_ATTEMPTS), reraise=True)
    def refine_entity_types(
        self,
        text: str,
        triplet: dict,
        candidate_subject_types: List[str],
        candidate_object_types: List[str],
    ) -> dict:
        """Refine relations and entity types using candidate backbone triplets."""
        triplet_filtered = {
            k: triplet[k]
            for k in ["subject", "relation", "object", "subject_type", "object_type"]
        }

        candidates_subject_types_str = json.dumps(candidate_subject_types)
        candidates_object_types_str = json.dumps(candidate_object_types)
        logger.log(
            logging.DEBUG,
            "candidates subject types: %s\n%s",
            str(candidates_subject_types_str),
            "-" * 100,
        )
        logger.log(
            logging.DEBUG,
            "candidates object types: %s\n%s",
            str(candidates_object_types_str),
            "-" * 100,
        )

        self._refine_attempt += 1
        attempt = self._refine_attempt
        logger.log(
            logging.DEBUG, "Attempt of a function call refine_entity_types: %s", attempt
        )
        system_prompt = self.prompts["entity_types_ranker"]
        if attempt > 1:
            prev_error = self._prev_error
            system_prompt += f"\n(Previous attempt #{attempt-1} failed with error: {prev_error}. Please adjust your answer!)"
            logger.log(logging.ERROR, "System prompt: %s", system_prompt)

        try:
            output = self.get_completion(
                system_prompt=system_prompt,
                user_prompt=f'Text: "{text}\nExtracted Triplet: {json.dumps(triplet_filtered)}\n'
                f"Candidate Subject Types: {candidates_subject_types_str}\n"
                f"Candidate Object Types: {candidates_object_types_str}",
            )
        except Exception as e:
            self._prev_error = e
            logger.log(logging.ERROR, str(e))
            # if json from output is broken after 3 attempts - raise an exception
            if attempt > self.MAX_ATTEMPTS:
                raise e

        logger.log(
            logging.DEBUG,
            "refined subject type: %s\n%s",
            str(output["subject_type"]),
            "-" * 100,
        )
        logger.log(
            logging.DEBUG,
            "refined object type: %s\n%s",
            str(output["object_type"]),
            "-" * 100,
        )

        try:
            assert (
                output["subject_type"] in candidate_subject_types
            ), "Refined subject type is not in candidate subject types"
            assert (
                output["object_type"] in candidate_object_types
            ), "Refined object type is not in candidate object types"
        except Exception as e:
            self._prev_error = e
            logger.log(logging.ERROR, str(e))
            # do not raise an exception - save triplet in ontology filtered collection
        return output

    @tenacity.retry(stop=tenacity.stop_after_attempt(MAX_ATTEMPTS), reraise=True)
    def refine_relation(
        self, text: str, triplet: dict, candidate_relations: List[dict]
    ) -> dict:
        """Refine relation using candidate relations."""
        triplet_filtered = {
            k: triplet[k]
            for k in ["subject", "relation", "object", "subject_type", "object_type"]
        }

        candidates_str = json.dumps(candidate_relations, ensure_ascii=False)
        logger.log(
            logging.DEBUG,
            "candidates relations: %s\n%s",
            str(candidates_str),
            "-" * 100,
        )
        self._refine_attempt += 1
        attempt = self._refine_attempt

        logger.log(
            logging.DEBUG, "Attempt of a function call refine_relation: %s", attempt
        )
        system_prompt = self.prompts["relation_ranker"]

        if attempt > 1:
            prev_error = self._prev_error
            system_prompt += f"\n(Previous attempt #{attempt-1} failed with error {prev_error}. Please adjust your answer!)"
            logger.log(logging.ERROR, "System prompt: %s", system_prompt)
        try:
            output = self.get_completion(
                system_prompt=system_prompt,
                user_prompt=f'Text: "{text}\nExtracted Triplet: {json.dumps(triplet_filtered, ensure_ascii=False)}\n'
                f"Candidate relations: {candidates_str}",
                transform_to_json=True,
            )
        except Exception as e:
            self._prev_error = e
            logger.log(logging.ERROR, str(e))
            # if json from output is broken after 3 attempts - raise an exception
            if attempt > self.MAX_ATTEMPTS:
                raise e

        logger.log(
            logging.DEBUG,
            "refined relation: %s\n%s",
            str(output["relation"]),
            "-" * 100,
        )

        try:
            assert (
                output["relation"] in candidate_relations
            ), "Refined relation is not in candidate relations"
        except Exception as e:
            self._prev_error = e
            logger.log(logging.ERROR, str(e))
            # do not raise an exception - save triplet in ontology filtered collection

        return output

    @tenacity.retry(stop=tenacity.stop_after_attempt(MAX_ATTEMPTS), reraise=True)
    def refine_relation_wo_entity_types(
        self, text: str, triplet: dict, candidate_relations: List[dict]
    ) -> dict:
        """Refine relation using candidate relations."""
        triplet_filtered = {k: triplet[k] for k in ["subject", "relation", "object"]}
        candidates_str = json.dumps(candidate_relations, ensure_ascii=False)
        logger.log(
            logging.DEBUG,
            "candidates relations: %s\n%s",
            str(candidates_str),
            "-" * 100,
        )

        attempt = self._refine_attempt

        logger.log(
            logging.DEBUG,
            "Attempt of a function call refine_relation_wo_entity_types: %s",
            attempt,
        )
        self._refine_attempt += 1
        system_prompt = self.prompts["relation_ranker_wo_entity_types"]

        if attempt > 1:
            prev_error = self._prev_error
            system_prompt += f"\n(Previous attempt #{attempt-1} failed with error {prev_error}. Please adjust your answer!)"
            logger.log(logging.ERROR, "System prompt: %s", system_prompt)
        try:
            return self.get_completion(
                system_prompt=system_prompt,
                user_prompt=f'Text: "{text}\nExtracted Triplet: {json.dumps(triplet_filtered, ensure_ascii=False)}\n'
                f"Candidate relations: {candidates_str}",
                transform_to_json=False,
            )
        except Exception as e:
            self._prev_error = e
            logger.log(logging.ERROR, str(e))
            # if json from output is broken after 3 attempts - raise an exception
            if self._refine_attempt > self.MAX_ATTEMPTS:
                raise e

    def refine_relation_and_entity_types(
        self, text: str, triplet: dict, candidate_triplets: List[dict]
    ) -> dict:
        """Refine relations and entity types using candidate backbone triplets."""
        triplet_filtered = {
            k: triplet[k]
            for k in ["subject", "relation", "object", "subject_type", "object_type"]
        }

        candidates_str = "".join(f"{json.dumps(c)}\n" for c in candidate_triplets)

        return self.get_completion(
            system_prompt=self.prompts["relation_entity_types_ranker"],
            user_prompt=f'Text: "{text}\nExtracted Triplet: {json.dumps(triplet_filtered)}\n'
            f"Candidate Triplets: {candidates_str}",
        )

    def refine_entity(
        self,
        text: str,
        triplet: dict,
        candidates: List[str],
        is_object: bool = False,
        role: str = "user",
    ) -> dict:
        """Refine subject/object names using candidate options from pre-built KG."""

        triplet_filtered = {k: triplet[k] for k in ["subject", "relation", "object"]}
        original_name = triplet_filtered["object" if is_object else "subject"]

        self._refine_attempt += 1
        attempt = self._refine_attempt

        logger.log(
            logging.DEBUG, "Attempt of a function call refine_entity: %s", attempt
        )
        prompt_key = "object_ranker" if is_object else "subject_ranker"
        entity_type = "Object" if is_object else "Subject"
        system_prompt = self.prompts[prompt_key]

        if attempt > 1:
            prev_error = self._prev_error
            system_prompt += f"\n(Previous attempt #{attempt-1} failed with error: {prev_error}. Please adjust your answer!)"
            logger.log(logging.ERROR, "System prompt: %s", system_prompt)

        try:
            return self.get_completion(
                system_prompt=system_prompt,
                user_prompt=f'Text: "{text}\nRole: {role}\nExtracted Triplet: {json.dumps(triplet_filtered, ensure_ascii=False)}\n'
                f"Original {entity_type}: {original_name}\n"
                f'Candidate {entity_type}s: {json.dumps(candidates, ensure_ascii=False)}"',
                transform_to_json=False,
            )
        except Exception as e:
            self._prev_error = e
            logger.log(logging.ERROR, str(e))
            # if json from output is broken after 3 attempts - raise an exception
            if attempt > self.MAX_ATTEMPTS:
                raise e

    def extract_entities_from_question(self, question: str) -> dict:
        """Extract entities from a question."""
        return self.get_completion(
            system_prompt=self.prompts["question_entity_extractor"],
            user_prompt=f"Question: {question}",
        )

    def identify_relevant_entities(
        self, question: str, entity_list: List[str]
    ) -> List[str]:
        """Identify entities relevant to a question."""
        return self.get_completion(
            system_prompt=self.prompts["question_entity_ranker"],
            user_prompt=f"Question: {question}\nEntities: {entity_list}",
        )

    def identify_relevant_entities_wo_types(
        self, question: str, entity_list: List[str]
    ) -> List[str]:
        """Identify entities relevant to a question."""
        return self.get_completion(
            system_prompt=self.prompts["question_entity_ranker_wo_types"],
            user_prompt=f"Question: {question}\nEntities: {entity_list}",
        )

    def answer_question(self, question: str, triplets: List[dict]) -> str:
        """Answer a question using knowledge graph triplets."""
        return self.get_completion(
            system_prompt=self.prompts["qa"],
            user_prompt=f'Question: {question}\n\nTriplets: "{triplets}"',
            transform_to_json=False,
        )

    def collapse_question(
        self, original_question: str, question: str, answer: str
    ) -> str:
        """Collapse a question using knowledge graph triplets."""
        return self.get_completion(
            system_prompt=self.prompts["qa_collapsing"],
            user_prompt=f"Original multi-hop question: {original_question}\nAnswered sub-question: {question}\nAnswer: {answer}",
            transform_to_json=True,
        )

    def decompose_question(self, question: str) -> str:
        """Decompose a question using knowledge graph triplets."""
        return self.get_completion(
            system_prompt=self.prompts["question_decomposition_1"],
            user_prompt=f"Question: {question}",
            transform_to_json=False,
        )

    def check_if_question_is_answered(
        self, question: str, subquestions: List[str], answers: List[str]
    ) -> str:
        """Check if a question is answered."""
        user_prompt = (
            f"Original multi-hop question: {question}\nQuestion->answer sequence:\n"
        )
        for question, answer in zip(subquestions, answers):
            user_prompt += f"{question} -> {answer}\n"
        return self.get_completion(
            system_prompt=self.prompts["qa_is_answered"],
            user_prompt=user_prompt,
            transform_to_json=False,
        )

    def calculate_cost(self) -> float:
        """Calculate the total cost of API usage."""
        return self.current_cost / 1e6

    def calculate_used_tokens(self) -> int:
        """Calculate the total # of used tokens for generation"""
        return self.prompt_tokens_num, self.completion_tokens_num

    def reset_tokens(self):
        """Reset the total # of used tokens for generation"""
        self.prompt_tokens_num = 0
        self.completion_tokens_num = 0

    def reset_messages(self):
        """Reset the messages"""
        self.messages = []

    def reset_error_state(self):
        self._prev_error = None
        self._refine_attempt = 0
wikontic/utils/prompts/name_refinement/prompt_choose_relation_wo_entity_types.txt
ADDED

@@ -0,0 +1,17 @@
In the previous step, a triplet akin to one in the Wikidata knowledge graph was extracted from the text.
The triplet contains two entities (subject and object) and one relation that connects this subject and object.
Using semantic similarity, we linked the relation name to the most similar exact names from the knowledge graph built from previously seen texts.

You will be provided with the following:

Text: The original sentence or passage from which the triplet was extracted.
Extracted Triplet: A structured representation in the format { "subject": "...", "relation": "...", "object": "..." }.
Original relation: The relation (in other words, property) name from the triplet that needs refinement.
Candidate relations: A list of possible relation (property) names from previously seen texts.

Your Task:
Select the most contextually appropriate relation name from the Candidate relations list that best matches the relation from the extracted triplet and the context of the given Text.

- If an exact or semantically appropriate match for the relation name is found, return the corresponding name exactly as it appears in the list.
- If no suitable match for the relation exists, return the string "None".
- In case of a match, do not modify the name from the candidate list, add explanations, or provide any additional text.
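This template is wired to the relation_ranker_wo_entity_types key in openai_utils.py and consumed by LLMTripletExtractor.refine_relation_wo_entity_types. A hedged call sketch with made-up values, reusing the extractor instance from the usage example above:

# Hypothetical values; the method fills the prompt's Text / Extracted Triplet /
# Candidate relations slots and returns the model's answer as a raw string.
refined = extractor.refine_relation_wo_entity_types(
    text="Ada Lovelace was born in London.",
    triplet={"subject": "Ada Lovelace", "relation": "birthplace", "object": "London"},
    candidate_relations=["place of birth", "residence", "country of citizenship"],
)
# Expected to be one candidate name verbatim, or the string "None".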
wikontic/utils/prompts/name_refinement/prompt_choose_relation_wo_entity_types_dialog_bench.txt
ADDED
@@ -0,0 +1,18 @@
In the previous step, a triplet akin to one in the Wikidata knowledge graph was extracted from the text of a user-assistant dialog.
The triplet contains two entities (subject and object) and one relation that connects this subject and object.
Using semantic similarity, we linked the relation name to the most similar exact names from the knowledge graph built from previously seen texts.

You will be provided with the following:

Text: The original sentence or passage from the user-assistant dialog from which the triplet was extracted.
Extracted Triplet: A structured representation in the format { "subject": "...", "relation": "...", "object": "..." }.
Original relation: The relation (in other words, property) name from the triplet that needs refinement.
Candidate relations: A list of possible relation (property) names from previously seen texts.

Your Task:
Select the most contextually appropriate relation name from the Candidate relations list that best matches the relation from the extracted triplet and the context of the given Text.

- If an exact or semantically appropriate match for the relation name is found, return the corresponding name exactly as it appears in the list.
- If no suitable match for the relation exists, return the string "None".
- In case of a match, do not modify the name from the candidate list, add explanations, or provide any additional text.
- Preserve the language of the original text (particularly, Russian) for the name of the relation!
wikontic/utils/prompts/name_refinement/rank_object_names.txt
ADDED

@@ -0,0 +1,17 @@
In the previous step, a triplet akin to one in the Wikidata knowledge graph was extracted from the text.
The triplet contains two entities (subject and object) and one relation that connects this subject and object.
Using semantic similarity, we linked the object name to the most similar exact names from the knowledge graph built from previously seen texts.

You will be provided with the following:

Text: The original sentence or passage from which the triplet was extracted.
Extracted Triplet: A structured representation in the format { "subject": "...", "relation": "...", "object": "..." }.
Original Object: The object name that needs refinement.
Candidate Objects: A list of possible entity names from previously seen texts.

Your Task:
Select the most contextually appropriate object name from the Candidate Objects list that best matches the object from the extracted triplet and the context of the given Text.

- If an exact or semantically appropriate match for the object name is found, return the corresponding name exactly as it appears in the list.
- If no suitable match for the object exists, return the string "None".
- In case of a match, do not modify the name from the candidate list, add explanations, or provide any additional text.
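In openai_utils.py this template is registered under the object_ranker key and selected by LLMTripletExtractor.refine_entity when is_object=True (the subject_ranker template further below is used when is_object=False). A hedged sketch with illustrative values, again reusing the extractor instance from the usage example above:

# Hypothetical values; refine_entity picks the object_ranker prompt because is_object=True.
refined_object = extractor.refine_entity(
    text="She moved to New York City in 2001.",
    triplet={"subject": "she", "relation": "residence", "object": "New York"},
    candidates=["New York City", "New York (state)"],
    is_object=True,
)
# Returns one candidate name exactly as listed, or "None" if nothing fits.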
wikontic/utils/prompts/name_refinement/rank_object_names_dialog_bench.txt
ADDED

@@ -0,0 +1,18 @@
In the previous step, a triplet akin to one in the Wikidata knowledge graph was extracted from the text of a user-assistant dialog.
The triplet contains two entities (subject and object) and one relation that connects this subject and object.
Using semantic similarity, we linked the object name to the most similar exact names from the knowledge graph built from previously seen texts.

You will be provided with the following:

Text: The original sentence or passage from the user-assistant dialog from which the triplet was extracted.
Extracted Triplet: A structured representation in the format { "subject": "...", "relation": "...", "object": "..." }.
Original Object: The object name that needs refinement.
Candidate Objects: A list of possible entity names from previously seen texts.

Your Task:
Select the most contextually appropriate object name from the Candidate Objects list that best matches the object from the extracted triplet and the context of the given Text.

- If an exact or semantically appropriate match for the object name is found, return the corresponding name exactly as it appears in the list.
- If no suitable match for the object exists, return the string "None".
- In case of a match, do not modify the name from the candidate list, add explanations, or provide any additional text.
- Preserve the language of the original text (particularly, Russian) for the name of the object!
wikontic/utils/prompts/name_refinement/rank_object_qualifiers.txt
ADDED

@@ -0,0 +1,20 @@
In the previous step, a triplet akin to one in the Wikidata knowledge graph was extracted from the text.
The triplet contains two entities (subject and object) and one relation that connects this subject and object.
A triplet may also have a **qualifier** that provides more context (e.g., a date, place, or other attribute). A qualifier has a relation and an object like a triplet does, but instead of a subject, its relation connects its object to the triplet it belongs to. **Qualifiers must always be attached to a triplet** and never exist as standalone triplets.

Using semantic similarity, we linked the object name from the qualifier to the most similar exact names from the knowledge graph built from previously seen texts.

You will be provided with the following:

Text: The original sentence or passage from which the triplet was extracted.
Extracted Triplet: A structured representation in the format:
{"subject": "...", "relation": "...", "object": "...", "qualifier": {"relation": "...", "object": "..."}}.
Original Object: The object name that needs refinement.
Candidate Objects: A list of possible entity names from previously seen texts.

Your Task:
Select the most contextually appropriate object name from the Candidate Objects list that best matches the object from the extracted triplet and the context of the given Text.

- If an exact or semantically appropriate match for the object name is found, return the corresponding name exactly as it appears in the list.
- If no suitable match for the object exists, return the string "None".
- In case of a match, do not modify the name from the candidate list, add explanations, or provide any additional text.
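For concreteness, a hypothetical input in the qualifier format this template describes (the entity, relation, and date values are illustrative, Wikidata-style):

# Example of a triplet whose qualifier object would be ranked against candidates.
extracted_triplet = {
    "subject": "Barack Obama",
    "relation": "position held",
    "object": "President of the United States",
    "qualifier": {"relation": "start time", "object": "2009"},
}
candidate_objects = ["2009", "20 January 2009", "2008"]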
wikontic/utils/prompts/name_refinement/rank_subject_names.txt
ADDED

@@ -0,0 +1,18 @@
In the previous step, a triplet akin to one in the Wikidata knowledge graph was extracted from the text.
The triplet contains two entities (subject and object) and one relation that connects this subject and object.
Using semantic similarity, we linked the subject name to the most similar exact names from the knowledge graph built from previously seen texts.

You will be provided with the following:

Text: The original sentence or passage from which the triplet was extracted.
Extracted Triplet: A structured representation in the format { "subject": "...", "relation": "...", "object": "..." }.
Original Subject: The subject name that needs refinement.
Candidate Subjects: A list of possible entity names from previously seen texts.

Your Task:
Select the most contextually appropriate subject name from the Candidate Subjects list that best matches the subject from the extracted triplet and the context of the given Text.

- If an exact or semantically appropriate match is found, return the corresponding name exactly as it appears in the list.
- If no suitable match exists, return the string "None".
- In case of a match, do not modify the name from the candidate list, add explanations, or provide any additional text.
wikontic/utils/prompts/name_refinement/rank_subject_names_dialog_bench.txt
ADDED

@@ -0,0 +1,20 @@
In the previous step, a triplet akin to one in the Wikidata knowledge graph was extracted from the text of a user-assistant dialog.
The triplet contains two entities (subject and object) and one relation that connects this subject and object.
Using semantic similarity, we linked the subject name to the most similar exact names from the knowledge graph built from previously seen texts.

You will be provided with the following:

Text: The original sentence or passage from the user-assistant dialog from which the triplet was extracted.
Extracted Triplet: A structured representation in the format { "subject": "...", "relation": "...", "object": "..." }.
Original Subject: The subject name that needs refinement.
Candidate Subjects: A list of possible entity names from previously seen texts.

Your Task:
Select the most contextually appropriate subject name from the Candidate Subjects list that best matches the subject from the extracted triplet and the context of the given Text.

- If an exact or semantically appropriate match is found, return the corresponding name exactly as it appears in the list.
- If no suitable match exists, return the string "None".
- In case of a match, do not modify the name from the candidate list, add explanations, or provide any additional text.
- Preserve the language of the original text (particularly, Russian) for the name of the subject!