llm-ie 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/extractors.py ADDED
@@ -0,0 +1,496 @@
+ import os
+ import abc
+ import re
+ from typing import List, Dict, Tuple, Union
+ from llm_ie.data_types import LLMInformationExtractionFrame
+ from llm_ie.engines import InferenceEngine
+
+
+ class FrameExtractor:
+     def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
+         """
+         This is the abstract class for frame extraction.
+         Input an LLM inference engine, a system prompt (optional), and a prompt template (with instruction, few-shot examples).
+
+         Parameters
+         ----------
+         inference_engine : InferenceEngine
+             the LLM inference engine object. Must implement the chat() method.
+         prompt_template : str
+             prompt template with "{{<placeholder name>}}" placeholders.
+         system_prompt : str, Optional
+             system prompt.
+         """
+         self.inference_engine = inference_engine
+         self.prompt_template = prompt_template
+         self.system_prompt = system_prompt
+
+     @classmethod
+     def get_prompt_guide(cls) -> str:
+         with open(os.path.join('/home/daviden1013/David_projects/llm-ie', 'asset', 'prompt_guide', f"{cls.__name__}_prompt_guide.txt"), 'r') as f:
+             return f.read()
+
+     def _get_user_prompt(self, text_content:Union[str, Dict[str,str]]) -> str:
+         """
+         This method applies text_content to prompt_template and returns a prompt.
+
+         Parameters
+         ----------
+         text_content : Union[str, Dict[str,str]]
+             the input text content to put in the prompt template.
+             If str, the prompt template must have only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
+             If dict, all the keys must be included in the prompt template placeholders {{<placeholder name>}}.
+
+         Returns : str
+             a user prompt.
+         """
+         pattern = re.compile(r'{{(.*?)}}')
+         if isinstance(text_content, str):
+             matches = pattern.findall(self.prompt_template)
+             assert len(matches) == 1, \
+                 "When text_content is str, the prompt template must have only 1 placeholder {{<placeholder name>}}."
+             prompt = pattern.sub(text_content, self.prompt_template)
+
+         elif isinstance(text_content, dict):
+             placeholders = pattern.findall(self.prompt_template)
+             assert len(placeholders) == len(text_content), \
+                 f"Expect text_content ({len(text_content)}) and prompt template placeholders ({len(placeholders)}) to have equal size."
+             assert all([k in placeholders for k, _ in text_content.items()]), \
+                 f"All keys in text_content ({text_content.keys()}) must match placeholders in prompt template ({placeholders})."
+
+             prompt = pattern.sub(lambda match: text_content[match.group(1)], self.prompt_template)
+
+         return prompt
+
+     def _extract_json(self, gen_text:str) -> List[Dict[str, str]]:
+         """
+         This method inputs a generated text and outputs a list of information tuples (as dicts).
+         """
+         pattern = r'\{.*?\}'
+         out = []
+         for tup in re.findall(pattern, gen_text):
+             try:
+                 tup_dict = eval(tup)
+                 out.append(tup_dict)
+             except:
+                 print(f'Post-processing failed at:\n{tup}')
+         return out
+
+
+     def _find_entity_spans(self, text: str, entities: List[str], case_sensitive:bool=False) -> List[Tuple[int]]:
+         """
+         This function inputs a text and a list of entity texts, and
+         outputs a list of spans (2-tuples), one per entity.
+         Entities that are not found in the text will be None in the output.
+
+         Parameters
+         ----------
+         text : str
+             text that contains the entities
+         """
+         entity_spans = []
+         for entity in entities:
+             if case_sensitive:
+                 match = re.search(re.escape(entity), text)
+             else:
+                 match = re.search(re.escape(entity), text, re.IGNORECASE)
+
+             if match:
+                 start, end = match.span()
+                 entity_spans.append((start, end))
+                 # Replace the found entity with spaces to avoid finding the same instance again
+                 text = text[:start] + ' ' * (end - start) + text[end:]
+             else:
+                 entity_spans.append(None)
+
+         return entity_spans
+
+     @abc.abstractmethod
+     def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=2048, **kwrs) -> str:
+         """
+         This method inputs text content and outputs a string generated by the LLM.
+
+         Parameters:
+         ----------
+         text_content : Union[str, Dict[str,str]]
+             the input text content to put in the prompt template.
+             If str, the prompt template must have only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
+             If dict, all the keys must be included in the prompt template placeholders {{<placeholder name>}}.
+         max_new_tokens : int, Optional
+             the max number of new tokens the LLM can generate.
+
+         Return : str
+             the output from the LLM. Needs post-processing.
+         """
+         return NotImplemented
+
+
+     @abc.abstractmethod
+     def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=2048,
+                        document_key:str=None, **kwrs) -> List[LLMInformationExtractionFrame]:
+         """
+         This method inputs text content and outputs a list of LLMInformationExtractionFrame.
+         It uses the extract() method and post-processes the outputs into frames.
+
+         Parameters:
+         ----------
+         text_content : Union[str, Dict[str,str]]
+             the input text content to put in the prompt template.
+             If str, the prompt template must have only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
+             If dict, all the keys must be included in the prompt template placeholders {{<placeholder name>}}.
+         entity_key : str
+             the key (in output JSON) for entity text. Any extraction that does not include the entity key will be dropped.
+         max_new_tokens : int, Optional
+             the max number of new tokens the LLM should generate.
+         document_key : str, Optional
+             specify the key in text_content where the document text is.
+             If text_content is str, this parameter will be ignored.
+
+         Return : List[LLMInformationExtractionFrame]
+             a list of frames.
+         """
+         return NotImplemented
+
+
+ class BasicFrameExtractor(FrameExtractor):
+     def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
+         """
+         This class directly prompts the LLM for frame extraction.
+         Input a system prompt (optional), a prompt template (with instruction, few-shot examples),
+         and specify an LLM.
+
+         Parameters
+         ----------
+         inference_engine : InferenceEngine
+             the LLM inference engine object. Must implement the chat() method.
+         prompt_template : str
+             prompt template with "{{<placeholder name>}}" placeholders.
+         system_prompt : str, Optional
+             system prompt.
+         """
+         super().__init__(inference_engine=inference_engine,
+                          prompt_template=prompt_template,
+                          system_prompt=system_prompt,
+                          **kwrs)
+
+
+     def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=2048,
+                 temperature:float=0.0, stream:bool=False, **kwrs) -> str:
+         """
+         This method inputs a text and outputs a string generated by the LLM.
+
+         Parameters:
+         ----------
+         text_content : Union[str, Dict[str,str]]
+             the input text content to put in the prompt template.
+             If str, the prompt template must have only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
+             If dict, all the keys must be included in the prompt template placeholders {{<placeholder name>}}.
+         max_new_tokens : int, Optional
+             the max number of new tokens the LLM can generate.
+         temperature : float, Optional
+             the temperature for token sampling.
+         stream : bool, Optional
+             if True, the LLM-generated text will be printed to the terminal in real time.
+
+         Return : str
+             the output from the LLM. Needs post-processing.
+         """
+         response = self.inference_engine.chat(
+                         messages=[{'role': 'system', 'content': self.system_prompt},
+                                   {'role': 'user', 'content': self._get_user_prompt(text_content)}],
+                         max_new_tokens=max_new_tokens,
+                         temperature=temperature,
+                         stream=stream,
+                         **kwrs
+                     )
+
+         return response
+
+
+     def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=2048,
+                        temperature:float=0.0, document_key:str=None, **kwrs) -> List[LLMInformationExtractionFrame]:
+         """
+         This method inputs a text and outputs a list of LLMInformationExtractionFrame.
+         It uses the extract() method and post-processes the outputs into frames.
+
+         Parameters:
+         ----------
+         text_content : Union[str, Dict[str,str]]
+             the input text content to put in the prompt template.
+             If str, the prompt template must have only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
+             If dict, all the keys must be included in the prompt template placeholders {{<placeholder name>}}.
+         entity_key : str
+             the key (in output JSON) for entity text. Any extraction that does not include the entity key will be dropped.
+         max_new_tokens : int, Optional
+             the max number of new tokens the LLM should generate.
+         temperature : float, Optional
+             the temperature for token sampling.
+         document_key : str, Optional
+             specify the key in text_content where the document text is.
+             If text_content is str, this parameter will be ignored.
+
+         Return : List[LLMInformationExtractionFrame]
+             a list of frames.
+         """
+         frame_list = []
+         gen_text = self.extract(text_content=text_content,
+                                 max_new_tokens=max_new_tokens, temperature=temperature, **kwrs)
+         entity_json = self._extract_json(gen_text=gen_text)
+         if isinstance(text_content, str):
+             text = text_content
+         elif isinstance(text_content, dict):
+             text = text_content[document_key]
+
+         spans = self._find_entity_spans(text=text,
+                                         entities=[e[entity_key] for e in entity_json],
+                                         case_sensitive=False)
+
+         for i, (ent, span) in enumerate(zip(entity_json, spans)):
+             if span is not None:
+                 start, end = span
+                 frame = LLMInformationExtractionFrame(frame_id=f"{i}",
+                                                       start=start,
+                                                       end=end,
+                                                       entity_text=ent[entity_key],
+                                                       attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
+                 frame_list.append(frame)
+         return frame_list
+
+
+ class ReviewFrameExtractor(BasicFrameExtractor):
+     def __init__(self, inference_engine:InferenceEngine, prompt_template:str, review_prompt:str,
+                  review_mode:str, system_prompt:str=None, **kwrs):
+         """
+         This class adds a review step after the BasicFrameExtractor.
+         The review process asks the LLM to review its output and either:
+         1. add more frames while keeping the current ones. This is efficient for boosting recall.
+         2. regenerate frames (add new and delete existing).
+         Use the review_mode parameter to specify which. Note that the review_prompt should instruct the LLM accordingly.
+
+         Parameters:
+         ----------
+         inference_engine : InferenceEngine
+             the LLM inference engine object. Must implement the chat() method.
+         prompt_template : str
+             prompt template with "{{<placeholder name>}}" placeholders.
+         review_prompt : str
+             the prompt text that asks the LLM to review. Specify addition or revision in the instruction.
+         review_mode : str
+             review mode. Must be one of {"addition", "revision"}.
+             Addition mode only asks the LLM to add new frames, while revision mode asks the LLM to regenerate.
+         system_prompt : str, Optional
+             system prompt.
+         """
+         super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
+                          system_prompt=system_prompt, **kwrs)
+         self.review_prompt = review_prompt
+         assert review_mode in {"addition", "revision"}, 'review_mode must be one of {"addition", "revision"}.'
+         self.review_mode = review_mode
+
+
+     def extract(self, text_content:Union[str, Dict[str,str]],
+                 max_new_tokens:int=4096, temperature:float=0.0, stream:bool=False, **kwrs) -> str:
+         """
+         This method inputs a text and outputs a string generated by the LLM.
+
+         Parameters:
+         ----------
+         text_content : Union[str, Dict[str,str]]
+             the input text content to put in the prompt template.
+             If str, the prompt template must have only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
+             If dict, all the keys must be included in the prompt template placeholders {{<placeholder name>}}.
+         max_new_tokens : int, Optional
+             the max number of new tokens the LLM can generate.
+         temperature : float, Optional
+             the temperature for token sampling.
+         stream : bool, Optional
+             if True, the LLM-generated text will be printed to the terminal in real time.
+
+         Return : str
+             the output from the LLM. Needs post-processing.
+         """
+         # Prompt extraction
+         messages = [{'role': 'system', 'content': self.system_prompt},
+                     {'role': 'user', 'content': self._get_user_prompt(text_content)}]
+
+         initial = self.inference_engine.chat(
+                         messages=messages,
+                         max_new_tokens=max_new_tokens,
+                         temperature=temperature,
+                         stream=stream,
+                         **kwrs
+                     )
+
+         # Review
+         messages.append({'role': 'assistant', 'content': initial})
+         messages.append({'role': 'user', 'content': self.review_prompt})
+
+         review = self.inference_engine.chat(
+                         messages=messages,
+                         max_new_tokens=max_new_tokens,
+                         temperature=temperature,
+                         stream=stream,
+                         **kwrs
+                     )
+
+         # Output
+         if self.review_mode == "revision":
+             return review
+         elif self.review_mode == "addition":
+             return initial + '\n' + review
+
+
+ class SentenceFrameExtractor(FrameExtractor):
+     from nltk.tokenize.punkt import PunktSentenceTokenizer
+     def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
+         """
+         This class performs sentence-based information extraction.
+         A simulated chat follows this process:
+         1. system prompt (optional)
+         2. user instructions (schema, background, full text, few-shot examples...)
+         3. user inputs the first sentence
+         4. assistant outputs extractions
+         5. repeat #3 and #4
+
+         Input a system prompt (optional), a prompt template (with user instructions),
+         and specify an LLM.
+
+         Parameters
+         ----------
+         inference_engine : InferenceEngine
+             the LLM inference engine object. Must implement the chat() method.
+         prompt_template : str
+             prompt template with "{{<placeholder name>}}" placeholders.
+         system_prompt : str, Optional
+             system prompt.
+         """
+         super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
+                          system_prompt=system_prompt, **kwrs)
+
+     def _get_sentences(self, text:str) -> List[Dict[str,str]]:
+         """
+         This method sentence-tokenizes the input text into a list of sentences,
+         each a dict of {start, end, sentence_text}.
+
+         Parameters
+         ----------
+         text : str
+             text to sentence-tokenize.
+
+         Returns : List[Dict[str,str]]
+             a list of sentences as dicts with keys: {"sentence_text", "start", "end"}.
+         """
+         sentences = []
+         for start, end in self.PunktSentenceTokenizer().span_tokenize(text):
+             sentences.append({"sentence_text": text[start:end],
+                               "start": start,
+                               "end": end})
+         return sentences
+
+
+     def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
+                 document_key:str=None, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
+         """
+         This method inputs a text and outputs a list of LLM outputs, one per sentence.
+
+         Parameters:
+         ----------
+         text_content : Union[str, Dict[str,str]]
+             the input text content to put in the prompt template.
+             If str, the prompt template must have only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
+             If dict, all the keys must be included in the prompt template placeholders {{<placeholder name>}}.
+         max_new_tokens : int, Optional
+             the max number of new tokens the LLM should generate.
+         document_key : str, Optional
+             specify the key in text_content where the document text is.
+             If text_content is str, this parameter will be ignored.
+         temperature : float, Optional
+             the temperature for token sampling.
+         stream : bool, Optional
+             if True, the LLM-generated text will be printed to the terminal in real time.
+
+         Return : List[Dict[str,str]]
+             the per-sentence outputs from the LLM. Need post-processing.
+         """
+         # define output
+         output = []
+         # sentence tokenization
+         if isinstance(text_content, str):
+             sentences = self._get_sentences(text_content)
+         elif isinstance(text_content, dict):
+             sentences = self._get_sentences(text_content[document_key])
+         # construct chat messages
+         messages = []
+         if self.system_prompt:
+             messages.append({'role': 'system', 'content': self.system_prompt})
+
+         messages.append({'role': 'user', 'content': self._get_user_prompt(text_content)})
+         messages.append({'role': 'assistant', 'content': 'Sure, please start with the first sentence.'})
+
+         # generate sentence by sentence
+         for sent in sentences:
+             messages.append({'role': 'user', 'content': sent['sentence_text']})
+             if stream:
+                 print(f"\n\nSentence: \n{sent['sentence_text']}\n")
+                 print("Extraction:")
+
+             gen_text = self.inference_engine.chat(
+                             messages=messages,
+                             max_new_tokens=max_new_tokens,
+                             temperature=temperature,
+                             stream=stream,
+                             **kwrs
+                         )
+
+             # update chat messages
+             messages.append({'role': 'assistant', 'content': gen_text})
+             # add to output
+             output.append({'sentence_start': sent['start'],
+                            'sentence_end': sent['end'],
+                            'sentence_text': sent['sentence_text'],
+                            'gen_text': gen_text})
+         return output
+
+
+     def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str,
+                        max_new_tokens:int=512, document_key:str=None, **kwrs) -> List[LLMInformationExtractionFrame]:
+         """
+         This method inputs a text and outputs a list of LLMInformationExtractionFrame.
+         It uses the extract() method and post-processes the outputs into frames.
+
+         Parameters:
+         ----------
+         text_content : Union[str, Dict[str,str]]
+             the input text content to put in the prompt template.
+             If str, the prompt template must have only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
+             If dict, all the keys must be included in the prompt template placeholders {{<placeholder name>}}.
+         entity_key : str
+             the key (in output JSON) for entity text.
+         max_new_tokens : int, Optional
+             the max number of new tokens the LLM should generate.
+         document_key : str, Optional
+             specify the key in text_content where the document text is.
+             If text_content is str, this parameter will be ignored.
+
+         Return : List[LLMInformationExtractionFrame]
+             a list of frames.
+         """
+         llm_output_sentence = self.extract(text_content=text_content,
+                                            max_new_tokens=max_new_tokens, document_key=document_key, **kwrs)
+         frame_list = []
+         for sent in llm_output_sentence:
+             entity_json = self._extract_json(gen_text=sent['gen_text'])
+             spans = self._find_entity_spans(text=sent['sentence_text'],
+                                             entities=[e[entity_key] for e in entity_json], case_sensitive=False)
+             for ent, span in zip(entity_json, spans):
+                 if span is not None:
+                     start, end = span
+                     start += sent['sentence_start']
+                     end += sent['sentence_start']
+                     frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}",
+                                                           start=start,
+                                                           end=end,
+                                                           entity_text=ent[entity_key],
+                                                           attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
+                     frame_list.append(frame)
+         return frame_list
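
Taken together, these classes are used by constructing an inference engine, wrapping it in an extractor with a prompt template, and calling extract_frames(). The sketch below is illustrative only and is not part of the package diff: the OllamaInferenceEngine class and its model_name argument, the prompt template text, and the sample note are assumptions; any InferenceEngine subclass that implements chat() would be used the same way.

from llm_ie.engines import OllamaInferenceEngine   # assumed engine; any InferenceEngine with chat() works
from llm_ie.extractors import BasicFrameExtractor

# One placeholder only, so a plain str can be passed as text_content.
prompt_template = (
    "Extract all medication mentions from the note below. "
    "Return one JSON object per mention, e.g. {\"entity_text\": \"<medication>\", \"dosage\": \"<dosage>\"}.\n\n"
    "Note:\n{{note}}"
)

engine = OllamaInferenceEngine(model_name="llama3.1:8b")   # hypothetical engine/model
extractor = BasicFrameExtractor(inference_engine=engine, prompt_template=prompt_template)

note = "Patient was started on metformin 500 mg twice daily."
frames = extractor.extract_frames(text_content=note, entity_key="entity_text")
for frame in frames:
    # Each frame holds the character span, the matched entity text, and the remaining JSON keys as attributes.
    print(frame.start, frame.end, frame.entity_text, frame.attr)

SentenceFrameExtractor is called the same way but prompts sentence by sentence, and ReviewFrameExtractor additionally needs a review_prompt and review_mode at construction.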
@@ -0,0 +1,26 @@
+ import os
+ from llm_ie.engines import InferenceEngine
+ from llm_ie.extractors import FrameExtractor
+
+ class PromptEditor:
+     def __init__(self, inference_engine:InferenceEngine, extractor:FrameExtractor):
+         self.inference_engine = inference_engine
+         self.prompt_guide = extractor.get_prompt_guide()
+
+     def rewrite(self, draft:str) -> str:
+         with open(os.path.join('/home/daviden1013/David_projects/llm-ie', 'asset', 'PromptEditor_prompts', 'rewrite.txt'), 'r') as f:
+             prompt = f.read()
+
+         prompt = prompt.replace("{{draft}}", draft).replace("{{prompt_guideline}}", self.prompt_guide)
+         messages = [{"role": "user", "content": prompt}]
+         res = self.inference_engine.chat(messages, stream=True)
+         return res
+
+     def comment(self, draft:str) -> str:
+         with open(os.path.join('/home/daviden1013/David_projects/llm-ie', 'asset', 'PromptEditor_prompts', 'comment.txt'), 'r') as f:
+             prompt = f.read()
+
+         prompt = prompt.replace("{{draft}}", draft).replace("{{prompt_guideline}}", self.prompt_guide)
+         messages = [{"role": "user", "content": prompt}]
+         res = self.inference_engine.chat(messages, stream=True)
+         return res
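
PromptEditor ties an inference engine to an extractor's prompt guide and offers two helpers: comment() critiques a draft prompt against the guide, and rewrite() returns a revised prompt. A minimal sketch follows; it is not part of the diff. The import path for PromptEditor is assumed (the file name of this hunk is not shown above), the engine is the same hypothetical one as in the earlier sketch, and note that both helpers read their guide and prompt assets from the hard-coded paths seen in this release, so they only run where those files exist.

from llm_ie.engines import OllamaInferenceEngine      # assumed engine, as in the sketch above
from llm_ie.extractors import BasicFrameExtractor
from llm_ie.prompt_editor import PromptEditor         # import path assumed; hunk file name not shown in this diff

engine = OllamaInferenceEngine(model_name="llama3.1:8b")   # hypothetical engine/model
# get_prompt_guide() is a classmethod, so the extractor class itself (or an instance) can be passed.
editor = PromptEditor(inference_engine=engine, extractor=BasicFrameExtractor)

draft = "Extract medications and dosages from the clinical note."
feedback = editor.comment(draft)        # streams and returns the LLM's critique of the draft
better_prompt = editor.rewrite(draft)   # streams and returns a rewritten prompt following the guide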