llm-ie 0.4.6__py3-none-any.whl → 0.4.7__py3-none-any.whl

llm_ie/extractors.py CHANGED
@@ -288,7 +288,7 @@ class FrameExtractor(Extractor):
         return entity_spans

     @abc.abstractmethod
-    def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=2048, **kwrs) -> str:
+    def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=2048, return_messages_log:bool=False, **kwrs) -> str:
         """
         This method inputs text content and outputs a string generated by LLM

@@ -300,6 +300,8 @@ class FrameExtractor(Extractor):
             If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
         max_new_tokens : str, Optional
             the max number of new tokens LLM can generate.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : str
             the output from LLM. Need post-processing.
@@ -309,7 +311,7 @@ class FrameExtractor(Extractor):

     @abc.abstractmethod
     def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=2048,
-                       document_key:str=None, **kwrs) -> List[LLMInformationExtractionFrame]:
+                       document_key:str=None, return_messages_log:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs text content and outputs a list of LLMInformationExtractionFrame
         It use the extract() method and post-process outputs into frames.
@@ -327,6 +329,8 @@ class FrameExtractor(Extractor):
         document_key : str, Optional
             specify the key in text_content where document text is.
             If text_content is str, this parameter will be ignored.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : str
             a list of frames.
@@ -357,7 +361,7 @@ class BasicFrameExtractor(FrameExtractor):


     def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=2048,
-                temperature:float=0.0, stream:bool=False, **kwrs) -> str:
+                temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> str:
         """
         This method inputs a text and outputs a string generated by LLM.

@@ -373,6 +377,8 @@ class BasicFrameExtractor(FrameExtractor):
             the temperature for token sampling.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : str
             the output from LLM. Need post-processing.
@@ -390,13 +396,19 @@ class BasicFrameExtractor(FrameExtractor):
            **kwrs
        )

+        if return_messages_log:
+            messages.append({"role": "assistant", "content": response})
+            messages_log = [messages]
+            return response, messages_log
+
         return response


     def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=2048,
                        temperature:float=0.0, document_key:str=None, stream:bool=False,
                        case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2,
-                       fuzzy_score_cutoff:float=0.8, allow_overlap_entities:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
+                       fuzzy_score_cutoff:float=0.8, allow_overlap_entities:bool=False,
+                       return_messages_log:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs a text and outputs a list of LLMInformationExtractionFrame
         It use the extract() method and post-process outputs into frames.
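
To make the new flag concrete, here is a minimal usage sketch (not part of the diff). It assumes an already-constructed `BasicFrameExtractor` named `extractor` and an input string `note_text`; both names are illustrative:

```python
# Hypothetical sketch: with return_messages_log=True, extract() returns a
# (response, messages_log) tuple; otherwise it returns the response string.
result = extractor.extract(text_content=note_text, return_messages_log=True)
response, messages_log = result   # messages_log is a list of chat transcripts

response_only = extractor.extract(text_content=note_text)  # str, as before
```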
@@ -430,6 +442,8 @@ class BasicFrameExtractor(FrameExtractor):
         allow_overlap_entities : bool, Optional
             if True, entities can overlap in the text.
             Note that this can cause multiple frames to be generated on the same entity span if they have same entity text.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : str
             a list of frames.
@@ -442,11 +456,13 @@ class BasicFrameExtractor(FrameExtractor):
             text = text_content[document_key]

         frame_list = []
-        gen_text = self.extract(text_content=text_content,
-                                max_new_tokens=max_new_tokens,
-                                temperature=temperature,
-                                stream=stream,
-                                **kwrs)
+        extraction_results = self.extract(text_content=text_content,
+                                          max_new_tokens=max_new_tokens,
+                                          temperature=temperature,
+                                          stream=stream,
+                                          return_messages_log=return_messages_log,
+                                          **kwrs)
+        gen_text, messages_log = extraction_results if return_messages_log else (extraction_results, None)

         entity_json = []
         for entity in self._extract_json(gen_text=gen_text):
@@ -472,6 +488,10 @@ class BasicFrameExtractor(FrameExtractor):
                                                   entity_text=text[start:end],
                                                   attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
             frame_list.append(frame)
+
+        if return_messages_log:
+            return frame_list, messages_log
+
         return frame_list


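A corresponding hedged sketch for extract_frames() (again, `extractor`, `note_text`, and the entity_key value are assumptions for illustration):

```python
# Hypothetical sketch: extract_frames() now returns (frames, messages_log)
# when return_messages_log=True; each transcript ends with the assistant turn.
frames, messages_log = extractor.extract_frames(text_content=note_text,
                                                entity_key="entity_text",
                                                return_messages_log=True)
for messages in messages_log:
    assert messages[-1]["role"] == "assistant"
```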
@@ -518,7 +538,7 @@ class ReviewFrameExtractor(BasicFrameExtractor):


     def extract(self, text_content:Union[str, Dict[str,str]],
-                max_new_tokens:int=4096, temperature:float=0.0, stream:bool=False, **kwrs) -> str:
+                max_new_tokens:int=4096, temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> str:
         """
         This method inputs a text and outputs a string generated by LLM.

@@ -534,6 +554,8 @@ class ReviewFrameExtractor(BasicFrameExtractor):
             the temperature for token sampling.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : str
             the output from LLM. Need post-processing.
@@ -570,10 +592,18 @@ class ReviewFrameExtractor(BasicFrameExtractor):
        )

         # Output
+        output_text = ""
         if self.review_mode == "revision":
-            return review
+            output_text = review
         elif self.review_mode == "addition":
-            return initial + '\n' + review
+            output_text = initial + '\n' + review
+
+        if return_messages_log:
+            messages.append({"role": "assistant", "content": review})
+            messages_log = [messages]
+            return output_text, messages_log
+
+        return output_text


 class SentenceFrameExtractor(FrameExtractor):
@@ -657,7 +687,7 @@ class SentenceFrameExtractor(FrameExtractor):


     def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
-                document_key:str=None, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
+                document_key:str=None, temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict[str,str]]:
         """
         This method inputs a text and outputs a list of outputs per sentence.

@@ -676,6 +706,8 @@ class SentenceFrameExtractor(FrameExtractor):
             the temperature for token sampling.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : str
             the output from LLM. Need post-processing.
@@ -690,6 +722,9 @@ class SentenceFrameExtractor(FrameExtractor):
                 raise ValueError("document_key must be provided when text_content is dict.")
             sentences = self._get_sentences(text_content[document_key])

+        if return_messages_log:
+            messages_log = []
+
         # generate sentence by sentence
         for i, sent in enumerate(sentences):
             # construct chat messages
@@ -701,10 +736,20 @@ class SentenceFrameExtractor(FrameExtractor):

             if self.context_sentences == 0:
                 # no context, just place sentence of interest
-                messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+                if isinstance(text_content, str):
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+                else:
+                    sentence_content = text_content.copy()
+                    sentence_content[document_key] = sent['sentence_text']
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(sentence_content)})
             else:
                 # insert context
-                messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                if isinstance(text_content, str):
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                else:
+                    context_content = text_content.copy()
+                    context_content[document_key] = context
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
             # simulate conversation
             messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
             # place sentence of interest
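
Beyond logging, this hunk also changes behavior for dict-valued text_content: the sentence (or its context window) is copied into the document_key slot so any other prompt-template placeholders survive per-sentence prompting. A hedged sketch of what that enables (the placeholder names and objects are invented for illustration):

```python
# Hypothetical sketch: a prompt template with a "document" placeholder plus an
# extra "guideline" placeholder now works with sentence-by-sentence prompting.
text_content = {"document": note_text,                  # rewritten per sentence
                "guideline": "Only annotate medication names."}
outputs = sent_extractor.extract(text_content=text_content,
                                 document_key="document")
```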
@@ -724,6 +769,10 @@ class SentenceFrameExtractor(FrameExtractor):
                 stream=stream,
                 **kwrs
             )
+
+            if return_messages_log:
+                messages.append({"role": "assistant", "content": gen_text})
+                messages_log.append(messages)

             # add to output
             output.append({'sentence_start': sent['start'],
@@ -731,11 +780,15 @@ class SentenceFrameExtractor(FrameExtractor):
                            'sentence_text': sent['sentence_text'],
                            'gen_text': gen_text})

+        if return_messages_log:
+            return output, messages_log
+
         return output


     async def extract_async(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
-                            document_key:str=None, temperature:float=0.0, concurrent_batch_size:int=32, **kwrs) -> List[Dict[str,str]]:
+                            document_key:str=None, temperature:float=0.0, concurrent_batch_size:int=32,
+                            return_messages_log:bool=False, **kwrs) -> List[Dict[str,str]]:
         """
         The asynchronous version of the extract() method.

@@ -754,6 +807,11 @@ class SentenceFrameExtractor(FrameExtractor):
             the temperature for token sampling.
         concurrent_batch_size : int, Optional
             the number of sentences to process in concurrent.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.
+
+        Return : str
+            the output from LLM. Need post-processing.
         """
         # Check if self.inference_engine.chat_async() is implemented
         if not hasattr(self.inference_engine, 'chat_async'):
@@ -769,10 +827,14 @@ class SentenceFrameExtractor(FrameExtractor):
                 raise ValueError("document_key must be provided when text_content is dict.")
             sentences = self._get_sentences(text_content[document_key])

+        if return_messages_log:
+            messages_log = []
+
         # generate sentence by sentence
         for i in range(0, len(sentences), concurrent_batch_size):
             tasks = []
             batch = sentences[i:i + concurrent_batch_size]
+            batch_messages = []
             for j, sent in enumerate(batch):
                 # construct chat messages
                 messages = []
@@ -783,10 +845,20 @@ class SentenceFrameExtractor(FrameExtractor):

                 if self.context_sentences == 0:
                     # no context, just place sentence of interest
-                    messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+                    if isinstance(text_content, str):
+                        messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+                    else:
+                        sentence_content = text_content.copy()
+                        sentence_content[document_key] = sent['sentence_text']
+                        messages.append({'role': 'user', 'content': self._get_user_prompt(sentence_content)})
                 else:
                     # insert context
-                    messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                    if isinstance(text_content, str):
+                        messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                    else:
+                        context_content = text_content.copy()
+                        context_content[document_key] = context
+                        messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
                 # simulate conversation
                 messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
                 # place sentence of interest
@@ -802,16 +874,25 @@ class SentenceFrameExtractor(FrameExtractor):
                     )
                 )
                 tasks.append(task)
+                batch_messages.append(messages)

             # Wait until the batch is done, collect results and move on to next batch
             responses = await asyncio.gather(*tasks)

             # Collect outputs
-            for gen_text, sent in zip(responses, batch):
+            for gen_text, sent, messages in zip(responses, batch, batch_messages):
+                if return_messages_log:
+                    messages.append({"role": "assistant", "content": gen_text})
+                    messages_log.append(messages)
+
                 output.append({'sentence_start': sent['start'],
                                'sentence_end': sent['end'],
                                'sentence_text': sent['sentence_text'],
                                'gen_text': gen_text})
+
+        if return_messages_log:
+            return output, messages_log
+
         return output


@@ -819,7 +900,7 @@ class SentenceFrameExtractor(FrameExtractor):
                        document_key:str=None, temperature:float=0.0, stream:bool=False,
                        concurrent:bool=False, concurrent_batch_size:int=32,
                        case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
-                       allow_overlap_entities:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
+                       allow_overlap_entities:bool=False, return_messages_log:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs a text and outputs a list of LLMInformationExtractionFrame
         It use the extract() method and post-process outputs into frames.
@@ -857,6 +938,8 @@ class SentenceFrameExtractor(FrameExtractor):
         allow_overlap_entities : bool, Optional
             if True, entities can overlap in the text.
             Note that this can cause multiple frames to be generated on the same entity span if they have same entity text.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : str
             a list of frames.
@@ -866,20 +949,25 @@ class SentenceFrameExtractor(FrameExtractor):
             warnings.warn("stream=True is not supported in concurrent mode.", RuntimeWarning)

             nest_asyncio.apply() # For Jupyter notebook. Terminal does not need this.
-            llm_output_sentences = asyncio.run(self.extract_async(text_content=text_content,
-                                                                  max_new_tokens=max_new_tokens,
-                                                                  document_key=document_key,
-                                                                  temperature=temperature,
-                                                                  concurrent_batch_size=concurrent_batch_size,
-                                                                  **kwrs)
-                                               )
+            extraction_results = asyncio.run(self.extract_async(text_content=text_content,
+                                                                max_new_tokens=max_new_tokens,
+                                                                document_key=document_key,
+                                                                temperature=temperature,
+                                                                concurrent_batch_size=concurrent_batch_size,
+                                                                return_messages_log=return_messages_log,
+                                                                **kwrs)
+                                             )
         else:
-            llm_output_sentences = self.extract(text_content=text_content,
+            extraction_results = self.extract(text_content=text_content,
                                                 max_new_tokens=max_new_tokens,
                                                 document_key=document_key,
                                                 temperature=temperature,
                                                 stream=stream,
+                                                return_messages_log=return_messages_log,
                                                 **kwrs)
+
+        llm_output_sentences, messages_log = extraction_results if return_messages_log else (extraction_results, None)
+
         frame_list = []
         for sent in llm_output_sentences:
             entity_json = []
@@ -908,6 +996,9 @@ class SentenceFrameExtractor(FrameExtractor):
                                                       entity_text=entity_text,
                                                       attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
                 frame_list.append(frame)
+
+        if return_messages_log:
+            return frame_list, messages_log
         return frame_list


@@ -963,7 +1054,7 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):


     def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
-                document_key:str=None, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
+                document_key:str=None, temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict[str,str]]:
         """
         This method inputs a text and outputs a list of outputs per sentence.

@@ -982,6 +1073,8 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
             the temperature for token sampling.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : str
             the output from LLM. Need post-processing.
@@ -996,6 +1089,9 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
                 raise ValueError("document_key must be provided when text_content is dict.")
             sentences = self._get_sentences(text_content[document_key])

+        if return_messages_log:
+            messages_log = []
+
         # generate sentence by sentence
         for i, sent in enumerate(sentences):
             # construct chat messages
@@ -1007,10 +1103,20 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):

             if self.context_sentences == 0:
                 # no context, just place sentence of interest
-                messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+                if isinstance(text_content, str):
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+                else:
+                    sentence_content = text_content.copy()
+                    sentence_content[document_key] = sent['sentence_text']
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(sentence_content)})
             else:
                 # insert context
-                messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                if isinstance(text_content, str):
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                else:
+                    context_content = text_content.copy()
+                    context_content[document_key] = context
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
             # simulate conversation
             messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
             # place sentence of interest
@@ -1033,6 +1139,7 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
             # Review
             if stream:
                 print(f"\n{Fore.YELLOW}Review:{Style.RESET_ALL}")
+
             messages.append({'role': 'assistant', 'content': initial})
             messages.append({'role': 'user', 'content': self.review_prompt})

@@ -1050,15 +1157,23 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
             elif self.review_mode == "addition":
                 gen_text = initial + '\n' + review

+            if return_messages_log:
+                messages.append({"role": "assistant", "content": review})
+                messages_log.append(messages)
+
             # add to output
             output.append({'sentence_start': sent['start'],
                            'sentence_end': sent['end'],
                            'sentence_text': sent['sentence_text'],
                            'gen_text': gen_text})
+
+        if return_messages_log:
+            return output, messages_log
+
         return output

     async def extract_async(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
-                            document_key:str=None, temperature:float=0.0, concurrent_batch_size:int=32, **kwrs) -> List[Dict[str,str]]:
+                            document_key:str=None, temperature:float=0.0, concurrent_batch_size:int=32, return_messages_log:bool=False, **kwrs) -> List[Dict[str,str]]:
         """
         The asynchronous version of the extract() method.

@@ -1077,6 +1192,8 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
             the temperature for token sampling.
         concurrent_batch_size : int, Optional
             the number of sentences to process in concurrent.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : str
             the output from LLM. Need post-processing.
@@ -1095,6 +1212,9 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
                 raise ValueError("document_key must be provided when text_content is dict.")
             sentences = self._get_sentences(text_content[document_key])

+        if return_messages_log:
+            messages_log = []
+
         # generate initial outputs sentence by sentence
         for i in range(0, len(sentences), concurrent_batch_size):
             messages_list = []
@@ -1111,10 +1231,20 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):

                 if self.context_sentences == 0:
                     # no context, just place sentence of interest
-                    messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+                    if isinstance(text_content, str):
+                        messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+                    else:
+                        sentence_content = text_content.copy()
+                        sentence_content[document_key] = sent['sentence_text']
+                        messages.append({'role': 'user', 'content': self._get_user_prompt(sentence_content)})
                 else:
                     # insert context
-                    messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                    if isinstance(text_content, str):
+                        messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                    else:
+                        context_content = text_content.copy()
+                        context_content[document_key] = context
+                        messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
                 # simulate conversation
                 messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
                 # place sentence of interest
@@ -1175,11 +1305,19 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
             elif self.review_mode == "addition":
                 gen_text = init['gen_text'] + '\n' + rev['gen_text']

+            if return_messages_log:
+                messages = init["messages"]
+                messages.append({"role": "assistant", "content": rev['gen_text']})
+                messages_log.append(messages)
+
             # add to output
             output.append({'sentence_start': init['sentence_start'],
                            'sentence_end': init['sentence_end'],
                            'sentence_text': init['sentence_text'],
                            'gen_text': gen_text})
+
+        if return_messages_log:
+            return output, messages_log
         return output


@@ -1221,7 +1359,7 @@ class SentenceCoTFrameExtractor(SentenceFrameExtractor):


     def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
-                document_key:str=None, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
+                document_key:str=None, temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict[str,str]]:
         """
         This method inputs a text and outputs a list of outputs per sentence.

@@ -1240,6 +1378,8 @@ class SentenceCoTFrameExtractor(SentenceFrameExtractor):
             the temperature for token sampling.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : str
             the output from LLM. Need post-processing.
@@ -1252,6 +1392,9 @@ class SentenceCoTFrameExtractor(SentenceFrameExtractor):
         elif isinstance(text_content, dict):
             sentences = self._get_sentences(text_content[document_key])

+        if return_messages_log:
+            messages_log = []
+
         # generate sentence by sentence
         for i, sent in enumerate(sentences):
             # construct chat messages
@@ -1263,10 +1406,20 @@ class SentenceCoTFrameExtractor(SentenceFrameExtractor):

             if self.context_sentences == 0:
                 # no context, just place sentence of interest
-                messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+                if isinstance(text_content, str):
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+                else:
+                    sentence_content = text_content.copy()
+                    sentence_content[document_key] = sent['sentence_text']
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(sentence_content)})
             else:
                 # insert context
-                messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                if isinstance(text_content, str):
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                else:
+                    context_content = text_content.copy()
+                    context_content[document_key] = context
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
             # simulate conversation
             messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
             # place sentence of interest
@@ -1286,11 +1439,18 @@ class SentenceCoTFrameExtractor(SentenceFrameExtractor):
                 **kwrs
             )

+            if return_messages_log:
+                messages.append({"role": "assistant", "content": gen_text})
+                messages_log.append(messages)
+
             # add to output
             output.append({'sentence_start': sent['start'],
                            'sentence_end': sent['end'],
                            'sentence_text': sent['sentence_text'],
                            'gen_text': gen_text})
+
+        if return_messages_log:
+            return output, messages_log
         return output


@@ -1361,7 +1521,7 @@ class RelationExtractor(Extractor):

     @abc.abstractmethod
     def extract_relations(self, doc:LLMInformationExtractionDocument, buffer_size:int=100, max_new_tokens:int=128,
-                          temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict]:
+                          temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict]:
         """
         This method considers all combinations of two frames.

@@ -1377,6 +1537,8 @@ class RelationExtractor(Extractor):
             the temperature for token sampling.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : List[Dict]
             a list of dict with {"frame_1", "frame_2"} for all relations.
@@ -1446,7 +1608,7 @@ class BinaryRelationExtractor(RelationExtractor):


     def extract(self, doc:LLMInformationExtractionDocument, buffer_size:int=100, max_new_tokens:int=128,
-                temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict]:
+                temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict]:
         """
         This method considers all combinations of two frames. Use the possible_relation_func to filter impossible pairs.
         Outputs pairs that are related.
@@ -1463,11 +1625,17 @@ class BinaryRelationExtractor(RelationExtractor):
             the temperature for token sampling.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : List[Dict]
             a list of dict with {"frame_1_id", "frame_2_id"}.
         """
         pairs = itertools.combinations(doc.frames, 2)
+
+        if return_messages_log:
+            messages_log = []
+
         output = []
         for frame_1, frame_2 in pairs:
             pos_rel = self.possible_relation_func(frame_1, frame_2)
@@ -1495,13 +1663,19 @@ class BinaryRelationExtractor(RelationExtractor):
                )
                rel_json = self._extract_json(gen_text)
                if self._post_process(rel_json):
-                    output.append({'frame_1':frame_1.frame_id, 'frame_2':frame_2.frame_id})
+                    output.append({'frame_1_id':frame_1.frame_id, 'frame_2_id':frame_2.frame_id})
+
+                if return_messages_log:
+                    messages.append({"role": "assistant", "content": gen_text})
+                    messages_log.append(messages)

+        if return_messages_log:
+            return output, messages_log
         return output


     async def extract_async(self, doc:LLMInformationExtractionDocument, buffer_size:int=100, max_new_tokens:int=128,
-                            temperature:float=0.0, concurrent_batch_size:int=32, **kwrs) -> List[Dict]:
+                            temperature:float=0.0, concurrent_batch_size:int=32, return_messages_log:bool=False, **kwrs) -> List[Dict]:
         """
         This is the asynchronous version of the extract() method.

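Note the key rename in the returned dicts here (frame_1/frame_2 become frame_1_id/frame_2_id), a small breaking change for downstream code. A hedged sketch of a consumer updated for 0.4.7 (`rel_extractor` and `annotated_doc` are assumed names):

```python
# Hypothetical sketch: consuming BinaryRelationExtractor output under the keys
# introduced in 0.4.7.
relations = rel_extractor.extract(doc=annotated_doc)
for rel in relations:
    print(rel["frame_1_id"], "<->", rel["frame_2_id"])  # was "frame_1"/"frame_2"
```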
@@ -1517,6 +1691,8 @@ class BinaryRelationExtractor(RelationExtractor):
             the temperature for token sampling.
         concurrent_batch_size : int, Optional
             the number of frame pairs to process in concurrent.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : List[Dict]
             a list of dict with {"frame_1", "frame_2"}.
@@ -1526,12 +1702,17 @@ class BinaryRelationExtractor(RelationExtractor):
             raise NotImplementedError(f"{self.inference_engine.__class__.__name__} does not have chat_async() method.")

         pairs = itertools.combinations(doc.frames, 2)
+        if return_messages_log:
+            messages_log = []
+
         n_frames = len(doc.frames)
         num_pairs = (n_frames * (n_frames-1)) // 2
-        rel_pair_list = []
-        tasks = []
+        output = []
         for i in range(0, num_pairs, concurrent_batch_size):
+            rel_pair_list = []
+            tasks = []
             batch = list(itertools.islice(pairs, concurrent_batch_size))
+            batch_messages = []
             for frame_1, frame_2 in batch:
                 pos_rel = self.possible_relation_func(frame_1, frame_2)

@@ -1546,6 +1727,7 @@ class BinaryRelationExtractor(RelationExtractor):
                                         "frame_1": str(frame_1.to_dict()),
                                         "frame_2": str(frame_2.to_dict())}
                                         )})
+
                 task = asyncio.create_task(
                     self.inference_engine.chat_async(
                         messages=messages,
@@ -1555,20 +1737,27 @@ class BinaryRelationExtractor(RelationExtractor):
                     )
                 )
                 tasks.append(task)
+                batch_messages.append(messages)

            responses = await asyncio.gather(*tasks)

-        output = []
-        for d, response in zip(rel_pair_list, responses):
-            rel_json = self._extract_json(response)
-            if self._post_process(rel_json):
-                output.append(d)
+            for d, response, messages in zip(rel_pair_list, responses, batch_messages):
+                if return_messages_log:
+                    messages.append({"role": "assistant", "content": response})
+                    messages_log.append(messages)

+                rel_json = self._extract_json(response)
+                if self._post_process(rel_json):
+                    output.append(d)
+
+        if return_messages_log:
+            return output, messages_log
         return output


     def extract_relations(self, doc:LLMInformationExtractionDocument, buffer_size:int=100, max_new_tokens:int=128,
-                          temperature:float=0.0, concurrent:bool=False, concurrent_batch_size:int=32, stream:bool=False, **kwrs) -> List[Dict]:
+                          temperature:float=0.0, concurrent:bool=False, concurrent_batch_size:int=32,
+                          stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict]:
         """
         This method considers all combinations of two frames. Use the possible_relation_func to filter impossible pairs.

@@ -1588,6 +1777,8 @@ class BinaryRelationExtractor(RelationExtractor):
             the number of frame pairs to process in concurrent.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : List[Dict]
             a list of dict with {"frame_1", "frame_2"} for all relations.
@@ -1608,6 +1799,7 @@ class BinaryRelationExtractor(RelationExtractor):
                                                  max_new_tokens=max_new_tokens,
                                                  temperature=temperature,
                                                  concurrent_batch_size=concurrent_batch_size,
+                                                 return_messages_log=return_messages_log,
                                                  **kwrs)
                                    )
         else:
@@ -1616,6 +1808,7 @@ class BinaryRelationExtractor(RelationExtractor):
                                  max_new_tokens=max_new_tokens,
                                  temperature=temperature,
                                  stream=stream,
+                                 return_messages_log=return_messages_log,
                                  **kwrs)


@@ -1689,7 +1882,7 @@ class MultiClassRelationExtractor(RelationExtractor):


     def extract(self, doc:LLMInformationExtractionDocument, buffer_size:int=100, max_new_tokens:int=128,
-                temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict]:
+                temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict]:
         """
         This method considers all combinations of two frames. Use the possible_relation_types_func to filter impossible pairs.

@@ -1705,11 +1898,17 @@ class MultiClassRelationExtractor(RelationExtractor):
             the temperature for token sampling.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : List[Dict]
-            a list of dict with {"frame_1", "frame_2", "relation"} for all frame pairs.
+            a list of dict with {"frame_1_id", "frame_2_id", "relation"} for all frame pairs.
         """
         pairs = itertools.combinations(doc.frames, 2)
+
+        if return_messages_log:
+            messages_log = []
+
         output = []
         for frame_1, frame_2 in pairs:
             pos_rel_types = self.possible_relation_types_func(frame_1, frame_2)
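
The same rename applies to MultiClassRelationExtractor, whose dicts also carry the relation type; a sketch under the same assumptions:

```python
# Hypothetical sketch: multi-class relation output in 0.4.7, with the optional
# messages log requested alongside.
relations, messages_log = mc_extractor.extract(doc=annotated_doc,
                                               return_messages_log=True)
for rel in relations:
    print(rel["frame_1_id"], rel["relation"], rel["frame_2_id"])
```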
@@ -1736,16 +1935,23 @@ class MultiClassRelationExtractor(RelationExtractor):
                    stream=stream,
                    **kwrs
                )
+
+                if return_messages_log:
+                    messages.append({"role": "assistant", "content": gen_text})
+                    messages_log.append(messages)
+
                rel_json = self._extract_json(gen_text)
                rel = self._post_process(rel_json, pos_rel_types)
                if rel:
-                    output.append({'frame_1':frame_1.frame_id, 'frame_2':frame_2.frame_id, 'relation':rel})
+                    output.append({'frame_1_id':frame_1.frame_id, 'frame_2_id':frame_2.frame_id, 'relation':rel})

+        if return_messages_log:
+            return output, messages_log
         return output


     async def extract_async(self, doc:LLMInformationExtractionDocument, buffer_size:int=100, max_new_tokens:int=128,
-                            temperature:float=0.0, concurrent_batch_size:int=32, **kwrs) -> List[Dict]:
+                            temperature:float=0.0, concurrent_batch_size:int=32, return_messages_log:bool=False, **kwrs) -> List[Dict]:
         """
         This is the asynchronous version of the extract() method.

@@ -1761,21 +1967,28 @@ class MultiClassRelationExtractor(RelationExtractor):
             the temperature for token sampling.
         concurrent_batch_size : int, Optional
             the number of frame pairs to process in concurrent.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : List[Dict]
-            a list of dict with {"frame_1", "frame_2", "relation"} for all frame pairs.
+            a list of dict with {"frame_1_id", "frame_2_id", "relation"} for all frame pairs.
         """
         # Check if self.inference_engine.chat_async() is implemented
         if not hasattr(self.inference_engine, 'chat_async'):
             raise NotImplementedError(f"{self.inference_engine.__class__.__name__} does not have chat_async() method.")

         pairs = itertools.combinations(doc.frames, 2)
+        if return_messages_log:
+            messages_log = []
+
         n_frames = len(doc.frames)
         num_pairs = (n_frames * (n_frames-1)) // 2
-        rel_pair_list = []
-        tasks = []
+        output = []
         for i in range(0, num_pairs, concurrent_batch_size):
+            rel_pair_list = []
+            tasks = []
             batch = list(itertools.islice(pairs, concurrent_batch_size))
+            batch_messages = []
             for frame_1, frame_2 in batch:
                 pos_rel_types = self.possible_relation_types_func(frame_1, frame_2)

@@ -1800,21 +2013,28 @@ class MultiClassRelationExtractor(RelationExtractor):
                     )
                 )
                 tasks.append(task)
+                batch_messages.append(messages)

            responses = await asyncio.gather(*tasks)

-        output = []
-        for d, response in zip(rel_pair_list, responses):
-            rel_json = self._extract_json(response)
-            rel = self._post_process(rel_json, d['pos_rel_types'])
-            if rel:
-                output.append({'frame_1':d['frame_1'], 'frame_2':d['frame_2'], 'relation':rel})
+            for d, response, messages in zip(rel_pair_list, responses, batch_messages):
+                if return_messages_log:
+                    messages.append({"role": "assistant", "content": response})
+                    messages_log.append(messages)
+
+                rel_json = self._extract_json(response)
+                rel = self._post_process(rel_json, d['pos_rel_types'])
+                if rel:
+                    output.append({'frame_1_id':d['frame_1'], 'frame_2_id':d['frame_2'], 'relation':rel})

+        if return_messages_log:
+            return output, messages_log
         return output


     def extract_relations(self, doc:LLMInformationExtractionDocument, buffer_size:int=100, max_new_tokens:int=128,
-                          temperature:float=0.0, concurrent:bool=False, concurrent_batch_size:int=32, stream:bool=False, **kwrs) -> List[Dict]:
+                          temperature:float=0.0, concurrent:bool=False, concurrent_batch_size:int=32,
+                          stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict]:
         """
         This method considers all combinations of two frames. Use the possible_relation_types_func to filter impossible pairs.

@@ -1834,6 +2054,8 @@ class MultiClassRelationExtractor(RelationExtractor):
             the number of frame pairs to process in concurrent.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.

         Return : List[Dict]
             a list of dict with {"frame_1", "frame_2", "relation"} for all relations.
@@ -1854,6 +2076,7 @@ class MultiClassRelationExtractor(RelationExtractor):
                                                  max_new_tokens=max_new_tokens,
                                                  temperature=temperature,
                                                  concurrent_batch_size=concurrent_batch_size,
+                                                 return_messages_log=return_messages_log,
                                                  **kwrs)
                                    )
         else:
@@ -1862,5 +2085,6 @@ class MultiClassRelationExtractor(RelationExtractor):
                                  max_new_tokens=max_new_tokens,
                                  temperature=temperature,
                                  stream=stream,
+                                 return_messages_log=return_messages_log,
                                  **kwrs)

llm_ie-0.4.6.dist-info/METADATA → llm_ie-0.4.7.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llm-ie
-Version: 0.4.6
+Version: 0.4.7
 Summary: An LLM-powered tool that transforms everyday language into robust information extraction pipelines.
 License: MIT
 Author: Enshuo (David) Hsu
@@ -1206,10 +1206,14 @@ We benchmarked the frame and relation extractors on biomedical information extra
 ## Citation
 For more information and benchmarks, please check our paper:
 ```bibtex
-@article{hsu2024llm,
-  title={LLM-IE: A Python Package for Generative Information Extraction with Large Language Models},
+@article{hsu2025llm,
+  title={LLM-IE: a python package for biomedical generative information extraction with large language models},
   author={Hsu, Enshuo and Roberts, Kirk},
-  journal={arXiv preprint arXiv:2411.11779},
-  year={2024}
+  journal={JAMIA open},
+  volume={8},
+  number={2},
+  pages={ooaf012},
+  year={2025},
+  publisher={Oxford University Press}
 }
 ```
llm_ie-0.4.6.dist-info/RECORD → llm_ie-0.4.7.dist-info/RECORD CHANGED
@@ -16,8 +16,8 @@ llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt,sha256=YomMhiA
 llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt,sha256=YomMhiA6BoBIJFOZ1jcoKRqY7_T3PGUsM0t0vd1IuZA,9325
 llm_ie/data_types.py,sha256=_Kt4Er1SMj1jg8U8TCXFJH_64prur-IbFngHKmZgWr8,15717
 llm_ie/engines.py,sha256=nWQzV7mcRCOkJ-U1iP-xrT9dVahVbj-nAjQci4XaRjY,22609
-llm_ie/extractors.py,sha256=xdg1aYN_My9N9HrzKuMniqFCzNCq7E3DKI7Ru9gawFs,89822
+llm_ie/extractors.py,sha256=O4jt0SxiAk5qKcnIhXqNDGP2Uo16w8UHhsz0Eem_qLQ,100374
 llm_ie/prompt_editor.py,sha256=pHRbg_yFZdoV63r3pvf0TsLfgH2EVJvzUQEVDH1Hj0s,9570
-llm_ie-0.4.6.dist-info/METADATA,sha256=wYKoOc0fvKvEAOvHGnMGOGgaKP49rmDUnHhA8vKBrgg,55745
-llm_ie-0.4.6.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-llm_ie-0.4.6.dist-info/RECORD,,
+llm_ie-0.4.7.dist-info/METADATA,sha256=B5YmHVvdweau9ZU0ORYPFPeI6s6I7WTcqZ6kuu7Xaeg,55821
+llm_ie-0.4.7.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+llm_ie-0.4.7.dist-info/RECORD,,