llm_ie-1.2.2-py3-none-any.whl → llm_ie-1.2.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_ie/__init__.py +5 -4
- llm_ie/asset/default_prompts/LLMUnitChunker_user_prompt.txt +129 -0
- llm_ie/chunkers.py +145 -6
- llm_ie/data_types.py +23 -37
- llm_ie/engines.py +621 -61
- llm_ie/extractors.py +341 -297
- llm_ie/prompt_editor.py +9 -32
- llm_ie/utils.py +95 -0
- {llm_ie-1.2.2.dist-info → llm_ie-1.2.4.dist-info}/METADATA +1 -1
- {llm_ie-1.2.2.dist-info → llm_ie-1.2.4.dist-info}/RECORD +11 -9
- {llm_ie-1.2.2.dist-info → llm_ie-1.2.4.dist-info}/WHEEL +0 -0
llm_ie/prompt_editor.py
CHANGED
@@ -2,6 +2,7 @@ import sys
 import warnings
 from typing import List, Dict, Generator
 import importlib.resources
+from llm_ie.utils import apply_prompt_template
 from llm_ie.engines import InferenceEngine
 from llm_ie.extractors import FrameExtractor
 import re

@@ -45,30 +46,6 @@ class PromptEditor:
 
         # internal memory (history messages) for the `chat` method
         self.messages = []
-
-    def _apply_prompt_template(self, text_content:Dict[str,str], prompt_template:str) -> str:
-        """
-        This method applies text_content to prompt_template and returns a prompt.
-
-        Parameters
-        ----------
-        text_content : Dict[str,str]
-            the input text content to put in prompt template.
-            all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
-
-        Returns : str
-            a prompt.
-        """
-        pattern = re.compile(r'{{(.*?)}}')
-        placeholders = pattern.findall(prompt_template)
-        if len(placeholders) != len(text_content):
-            raise ValueError(f"Expect text_content ({len(text_content)}) and prompt template placeholder ({len(placeholders)}) to have equal size.")
-        if not all([k in placeholders for k, _ in text_content.items()]):
-            raise ValueError(f"All keys in text_content ({text_content.keys()}) must match placeholders in prompt template ({placeholders}).")
-
-        prompt = pattern.sub(lambda match: re.sub(r'\\', r'\\\\', text_content[match.group(1)]), prompt_template)
-
-        return prompt
 
 
     def rewrite(self, draft:str) -> str:

@@ -80,8 +57,8 @@ class PromptEditor:
         with open(file_path, 'r') as f:
             rewrite_prompt_template = f.read()
 
-        prompt = 
-
+        prompt = apply_prompt_template(prompt_template=rewrite_prompt_template,
+                                       text_content={"draft": draft, "prompt_guideline": self.prompt_guide})
         messages = [{"role": "system", "content": self.system_prompt},
                     {"role": "user", "content": prompt}]
         res = self.inference_engine.chat(messages, verbose=True)

@@ -96,8 +73,8 @@ class PromptEditor:
         with open(file_path, 'r') as f:
             comment_prompt_template = f.read()
 
-        prompt = 
-
+        prompt = apply_prompt_template(prompt_template=comment_prompt_template,
+                                       text_content={"draft": draft, "prompt_guideline": self.prompt_guide})
         messages = [{"role": "system", "content": self.system_prompt},
                     {"role": "user", "content": prompt}]
         res = self.inference_engine.chat(messages, verbose=True)

@@ -254,8 +231,8 @@ class PromptEditor:
         with open(file_path, 'r') as f:
             chat_prompt_template = f.read()
 
-        guideline = 
-
+        guideline = apply_prompt_template(prompt_template=chat_prompt_template,
+                                          text_content={"prompt_guideline": self.prompt_guide})
 
         self.messages = [{"role": "system", "content": self.system_prompt + guideline}]
 

@@ -288,8 +265,8 @@ class PromptEditor:
         with open(file_path, 'r') as f:
             chat_prompt_template = f.read()
 
-        guideline = 
-
+        guideline = apply_prompt_template(prompt_template=chat_prompt_template,
+                                          text_content={"prompt_guideline": self.prompt_guide})
 
         messages = [{"role": "system", "content": self.system_prompt + guideline}] + messages
 
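The net effect of these hunks: the private PromptEditor._apply_prompt_template method is removed, and its four call sites now use the shared llm_ie.utils.apply_prompt_template helper. A minimal sketch of the new call pattern, with a made-up template (the real templates ship as .txt assets under llm_ie/asset/):

    from llm_ie.utils import apply_prompt_template

    # Hypothetical template for illustration; not one of the packaged assets.
    template = "Rewrite the draft:\n{{draft}}\n\nFollow this guideline:\n{{prompt_guideline}}"

    # The dict keys must match the {{...}} placeholders one-to-one,
    # otherwise apply_prompt_template raises ValueError.
    prompt = apply_prompt_template(prompt_template=template,
                                   text_content={"draft": "Extract all diagnoses.",
                                                 "prompt_guideline": "Be concise."})
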
llm_ie/utils.py
ADDED
@@ -0,0 +1,95 @@
+from typing import List, Dict, Union
+import re
+import json
+import warnings
+import json_repair
+
+def _find_dict_strings(text: str) -> List[str]:
+    """
+    Extracts balanced JSON-like dictionaries from a string, even if nested.
+
+    Parameters:
+    -----------
+    text : str
+        the input text containing JSON-like structures.
+
+    Returns : List[str]
+        A list of valid JSON-like strings representing dictionaries.
+    """
+    open_brace = 0
+    start = -1
+    json_objects = []
+
+    for i, char in enumerate(text):
+        if char == '{':
+            if open_brace == 0:
+                # start of a new JSON object
+                start = i
+            open_brace += 1
+        elif char == '}':
+            open_brace -= 1
+            if open_brace == 0 and start != -1:
+                json_objects.append(text[start:i + 1])
+                start = -1
+
+    return json_objects
+
+
+def extract_json(gen_text:str) -> List[Dict[str, str]]:
+    """
+    This method inputs a generated text and output a JSON of information tuples
+    """
+    out = []
+    dict_str_list = _find_dict_strings(gen_text)
+    for dict_str in dict_str_list:
+        try:
+            dict_obj = json.loads(dict_str)
+            out.append(dict_obj)
+        except json.JSONDecodeError:
+            dict_obj = json_repair.repair_json(dict_str, skip_json_loads=True, return_objects=True)
+            if dict_obj:
+                warnings.warn(f'JSONDecodeError detected, fixed with repair_json:\n{dict_str}', RuntimeWarning)
+                out.append(dict_obj)
+            else:
+                warnings.warn(f'JSONDecodeError could not be fixed:\n{dict_str}', RuntimeWarning)
+    return out
+
+
+def apply_prompt_template(prompt_template:str, text_content:Union[str, Dict[str,str]]) -> str:
+    """
+    This method applies text_content to prompt_template and returns a prompt.
+
+    Parameters:
+    ----------
+    prompt_template : str
+        the prompt template with placeholders {{<placeholder name>}}.
+    text_content : Union[str, Dict[str,str]]
+        the input text content to put in prompt template.
+        If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
+        If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}. All values must be str.
+
+    Returns : str
+        a user prompt.
+    """
+    pattern = re.compile(r'{{(.*?)}}')
+    if isinstance(text_content, str):
+        matches = pattern.findall(prompt_template)
+        if len(matches) != 1:
+            raise ValueError("When text_content is str, the prompt template must has exactly 1 placeholder {{<placeholder name>}}.")
+        text = re.sub(r'\\', r'\\\\', text_content)
+        prompt = pattern.sub(text, prompt_template)
+
+    elif isinstance(text_content, dict):
+        # Check if all values are str
+        if not all([isinstance(v, str) for v in text_content.values()]):
+            raise ValueError("All values in text_content must be str.")
+        # Check if all keys are in the prompt template
+        placeholders = pattern.findall(prompt_template)
+        if len(placeholders) != len(text_content):
+            raise ValueError(f"Expect text_content ({len(text_content)}) and prompt template placeholder ({len(placeholders)}) to have equal size.")
+        if not all([k in placeholders for k, _ in text_content.items()]):
+            raise ValueError(f"All keys in text_content ({text_content.keys()}) must match placeholders in prompt template ({placeholders}).")
+
+        prompt = pattern.sub(lambda match: re.sub(r'\\', r'\\\\', text_content[match.group(1)]), prompt_template)
+
+    return prompt
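For reference, a usage sketch of the new extract_json helper; the model output below is made up, and json_repair (imported at the top of the file) is assumed to be installed:

    from llm_ie.utils import extract_json

    # Typical LLM output: prose mixed with JSON; the second object has a trailing comma.
    gen_text = '''Extracted frames:
    {"entity_text": "hypertension", "attr": {"certainty": "confirmed"}}
    {"entity_text": "aspirin", "attr": {"dose": "81 mg",}}'''

    # _find_dict_strings() collects balanced {...} spans, including nested braces.
    # The malformed second object fails json.loads(), is recovered by
    # json_repair.repair_json(), and a RuntimeWarning is emitted.
    frames = extract_json(gen_text)
    # -> [{'entity_text': 'hypertension', 'attr': {'certainty': 'confirmed'}},
    #     {'entity_text': 'aspirin', 'attr': {'dose': '81 mg'}}]
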
{llm_ie-1.2.2.dist-info → llm_ie-1.2.4.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llm-ie
-Version: 1.2.2
+Version: 1.2.4
 Summary: A comprehensive toolkit that provides building blocks for LLM-based named entity recognition, attribute extraction, and relation extraction pipelines.
 License: MIT
 Author: Enshuo (David) Hsu
{llm_ie-1.2.2.dist-info → llm_ie-1.2.4.dist-info}/RECORD
CHANGED

@@ -1,10 +1,11 @@
-llm_ie/__init__.py,sha256=
+llm_ie/__init__.py,sha256=9a0bTN2ol5k_rCEidhnqIwJCnVTfit7TbTtbWG4hj1s,1881
 llm_ie/asset/PromptEditor_prompts/chat.txt,sha256=Fq62voV0JQ8xBRcxS1Nmdd7DkHs1fGYb-tmNwctZZK0,118
 llm_ie/asset/PromptEditor_prompts/comment.txt,sha256=C_lxx-dlOlFJ__jkHKosZ8HsNAeV1aowh2B36nIipBY,159
 llm_ie/asset/PromptEditor_prompts/rewrite.txt,sha256=JAwY9vm1jSmKf2qcLBYUvrSmME2EJH36bALmkwZDWYQ,178
 llm_ie/asset/PromptEditor_prompts/system.txt,sha256=QwGTIJvp-5u2P8CkGt_rabttlN1puHQwIBNquUm1ZHo,730
 llm_ie/asset/default_prompts/BasicReviewFrameExtractor_addition_review_prompt.txt,sha256=pKes8BOAoJJgmo_IQh2ISKiMh_rDPl_rDUU_VgDQ4o4,273
 llm_ie/asset/default_prompts/BasicReviewFrameExtractor_revision_review_prompt.txt,sha256=9Nwkr2U_3ZSk01xDtgiFJVABi6FkC8Izdq7zrzFfLRg,235
+llm_ie/asset/default_prompts/LLMUnitChunker_user_prompt.txt,sha256=tf9tu9FvNFpp26J7S39bJLuiI5R47bapDdEplvvbJU4,4203
 llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt,sha256=NLEtnmx1aOsnwifAsXr65pX9WdrIWdx-MJ7aMtNKi8c,331
 llm_ie/asset/default_prompts/ReviewFrameExtractor_revision_review_prompt.txt,sha256=lGGjdeFpzZEc56w-EtQDMyYFs7A3DQAM32sT42Nf_08,293
 llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_addition_review_prompt.txt,sha256=Of11LFuXLB249oekFelzlIeoAB0cATReqWgFTvhNz_8,329

@@ -18,11 +19,12 @@ llm_ie/asset/prompt_guide/MultiClassRelationExtractor_prompt_guide.txt,sha256=EQ
 llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt,sha256=rBRIXg8JQWUHTRdoluTS0zkbTkBAacEtHHvr3lZaQCw,10437
 llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt,sha256=97_-y_vHMLG4Kb8fLsGgibLxB-3mest8k3LHfLo5h-I,10465
 llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt,sha256=97_-y_vHMLG4Kb8fLsGgibLxB-3mest8k3LHfLo5h-I,10465
-llm_ie/chunkers.py,sha256=
-llm_ie/data_types.py,sha256=
-llm_ie/engines.py,sha256=
-llm_ie/extractors.py,sha256=
-llm_ie/prompt_editor.py,sha256=
-llm_ie
-llm_ie-1.2.
-llm_ie-1.2.
+llm_ie/chunkers.py,sha256=b4APRwaLMU40QXVEhOK8m1DZi_jr-VCHAFwbMjqVBgA,11308
+llm_ie/data_types.py,sha256=6vefyGTgZcJBYgiuyfcbJN1ZKK4tNvOZf6HFpxFZngY,17792
+llm_ie/engines.py,sha256=K4Zgb1dYiuopBeTLcgSAseI-VXgwtTeWf9O4EK9SQqE,63901
+llm_ie/extractors.py,sha256=f-TUZFprJZ_ftrnKbi-g-au4KoJwtciCCawXHWzmDtU,100792
+llm_ie/prompt_editor.py,sha256=Hqukm2HMgsoGpXV3vZ__7CGgfMhd-UUIwTKGnfSDltM,12055
+llm_ie/utils.py,sha256=k6M4l8GsKOMcmO6UwONQ353Zk-TeoBj6HXGjlAn-JE0,3679
+llm_ie-1.2.4.dist-info/METADATA,sha256=dl0JyDkgjEbk12N5I1fZg-jh7gEvTpuJ1Ox1_mHo_6Q,728
+llm_ie-1.2.4.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+llm_ie-1.2.4.dist-info/RECORD,,
{llm_ie-1.2.2.dist-info → llm_ie-1.2.4.dist-info}/WHEEL
File without changes