llm-ie 0.4.7__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/extractors.py CHANGED
@@ -8,10 +8,12 @@ import warnings
8
8
  import itertools
9
9
  import asyncio
10
10
  import nest_asyncio
11
- from typing import Set, List, Dict, Tuple, Union, Callable
12
- from llm_ie.data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
11
+ from typing import Any, Set, List, Dict, Tuple, Union, Callable, Generator, Optional
12
+ from llm_ie.data_types import FrameExtractionUnit, FrameExtractionUnitResult, LLMInformationExtractionFrame, LLMInformationExtractionDocument
13
+ from llm_ie.chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker
14
+ from llm_ie.chunkers import ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
13
15
  from llm_ie.engines import InferenceEngine
14
- from colorama import Fore, Style
16
+ from colorama import Fore, Style
15
17
 
16
18
 
17
19
  class Extractor:
@@ -38,15 +40,46 @@ class Extractor:
38
40
  def get_prompt_guide(cls) -> str:
39
41
  """
40
42
  This method returns the pre-defined prompt guideline for the extractor from the package asset.
43
+ It searches for a guide specific to the current class first; if not found, it will search
44
+ for the guide in its ancestors by traversing the class's method resolution order (MRO).
41
45
  """
42
- # Check if the prompt guide is available
43
- file_path = importlib.resources.files('llm_ie.asset.prompt_guide').joinpath(f"{cls.__name__}_prompt_guide.txt")
44
- try:
45
- with open(file_path, 'r', encoding="utf-8") as f:
46
- return f.read()
47
- except FileNotFoundError:
48
- warnings.warn(f"Prompt guide for {cls.__name__} is not available. Is it a customed extractor?", UserWarning)
49
- return None
46
+ original_class_name = cls.__name__
47
+
48
+ for current_class_in_mro in cls.__mro__:
49
+ if current_class_in_mro is object:
50
+ continue
51
+
52
+ current_class_name = current_class_in_mro.__name__
53
+
54
+ try:
55
+ file_path_obj = importlib.resources.files('llm_ie.asset.prompt_guide').joinpath(f"{current_class_name}_prompt_guide.txt")
56
+
57
+ with open(file_path_obj, 'r', encoding="utf-8") as f:
58
+ prompt_content = f.read()
59
+ # If the guide was found for an ancestor, not the original class, issue a warning.
60
+ if cls is not current_class_in_mro:
61
+ warnings.warn(
62
+ f"Prompt guide for '{original_class_name}' not found. "
63
+ f"Using guide from ancestor: '{current_class_name}_prompt_guide.txt'.",
64
+ UserWarning
65
+ )
66
+ return prompt_content
67
+ except FileNotFoundError:
68
+ pass
69
+
70
+ except Exception as e:
71
+ warnings.warn(
72
+ f"Error attempting to read prompt guide for '{current_class_name}' "
73
+ f"from '{str(file_path_obj)}': {e}. Trying next in MRO.",
74
+ UserWarning
75
+ )
76
+ continue
77
+
78
+ # If the loop completes, no prompt guide was found for the original class or any of its ancestors.
79
+ raise FileNotFoundError(
80
+ f"Prompt guide for '{original_class_name}' not found in the package asset. "
81
+ f"Is it a custom extractor?"
82
+ )
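The fallback above can be exercised with a short sketch. The subclass name below is hypothetical; it assumes that get_prompt_guide() is a classmethod (as its cls signature suggests) and that a DirectFrameExtractor_prompt_guide.txt ships in llm_ie.asset.prompt_guide.

from llm_ie.extractors import DirectFrameExtractor

# Hypothetical subclass with no prompt guide file of its own in the package asset.
class MyFrameExtractor(DirectFrameExtractor):
    pass

# No MyFrameExtractor_prompt_guide.txt exists, so the MRO walk falls back to the
# DirectFrameExtractor guide and emits a UserWarning naming the ancestor file used.
guide = MyFrameExtractor.get_prompt_guide()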
50
83
 
51
84
  def _get_user_prompt(self, text_content:Union[str, Dict[str,str]]) -> str:
52
85
  """
@@ -138,7 +171,8 @@ class Extractor:
138
171
 
139
172
  class FrameExtractor(Extractor):
140
173
  from nltk.tokenize import RegexpTokenizer
141
- def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
174
+ def __init__(self, inference_engine:InferenceEngine, unit_chunker:UnitChunker,
175
+ prompt_template:str, system_prompt:str=None, context_chunker:ContextChunker=None, **kwrs):
142
176
  """
143
177
  This is the abstract class for frame extraction.
144
178
  Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).
@@ -147,15 +181,25 @@ class FrameExtractor(Extractor):
147
181
  ----------
148
182
  inference_engine : InferenceEngine
149
183
the LLM inference engine object. Must implement the chat() method.
184
+ unit_chunker : UnitChunker
185
+ the unit chunker object that determines how to chunk the document text into units.
150
186
  prompt_template : str
151
187
  prompt template with "{{<placeholder name>}}" placeholder.
152
188
  system_prompt : str, Optional
153
189
  system prompt.
190
+ context_chunker : ContextChunker
191
+ the context chunker object that determines how to get context for each unit.
154
192
  """
155
193
  super().__init__(inference_engine=inference_engine,
156
194
  prompt_template=prompt_template,
157
195
  system_prompt=system_prompt,
158
196
  **kwrs)
197
+
198
+ self.unit_chunker = unit_chunker
199
+ if context_chunker is None:
200
+ self.context_chunker = NoContextChunker()
201
+ else:
202
+ self.context_chunker = context_chunker
159
203
 
160
204
  self.tokenizer = self.RegexpTokenizer(r'\w+|[^\w\s]')
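A construction sketch for a concrete subclass such as DirectFrameExtractor (added further down in this diff). Only the class names come from the imports above; the chunker constructor arguments are assumptions, and engine stands for any InferenceEngine implementation.

from llm_ie.chunkers import SentenceUnitChunker, SlideWindowContextChunker
from llm_ie.extractors import DirectFrameExtractor

engine = ...  # placeholder: any InferenceEngine implementation
prompt_template = "### Task\n...\n### Text\n{{text}}"  # must use "{{<placeholder name>}}" placeholders

extractor = DirectFrameExtractor(
    inference_engine=engine,
    unit_chunker=SentenceUnitChunker(),  # one unit per sentence (no-arg constructor assumed)
    prompt_template=prompt_template,
    system_prompt=None,
    context_chunker=SlideWindowContextChunker(window_size=2),  # +/- 2 sentences of context (signature assumed)
)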
161
205
 
@@ -338,32 +382,38 @@ class FrameExtractor(Extractor):
338
382
  return NotImplemented
339
383
 
340
384
 
341
- class BasicFrameExtractor(FrameExtractor):
342
- def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
385
+ class DirectFrameExtractor(FrameExtractor):
386
+ def __init__(self, inference_engine:InferenceEngine, unit_chunker:UnitChunker,
387
+ prompt_template:str, system_prompt:str=None, context_chunker:ContextChunker=None, **kwrs):
343
388
  """
344
- This class diretly prompt LLM for frame extraction.
345
- Input system prompt (optional), prompt template (with instruction, few-shot examples),
346
- and specify a LLM.
389
+ This class is for general unit-context frame extraction.
390
+ Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).
347
391
 
348
392
  Parameters:
349
393
  ----------
350
394
  inference_engine : InferenceEngine
351
395
the LLM inference engine object. Must implement the chat() method.
396
+ unit_chunker : UnitChunker
397
+ the unit chunker object that determines how to chunk the document text into units.
352
398
  prompt_template : str
353
399
  prompt template with "{{<placeholder name>}}" placeholder.
354
400
  system_prompt : str, Optional
355
401
  system prompt.
402
+ context_chunker : ContextChunker
403
+ the context chunker object that determines how to get context for each unit.
356
404
  """
357
- super().__init__(inference_engine=inference_engine,
358
- prompt_template=prompt_template,
359
- system_prompt=system_prompt,
405
+ super().__init__(inference_engine=inference_engine,
406
+ unit_chunker=unit_chunker,
407
+ prompt_template=prompt_template,
408
+ system_prompt=system_prompt,
409
+ context_chunker=context_chunker,
360
410
  **kwrs)
361
-
411
+
362
412
 
363
413
  def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=2048,
364
- temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> str:
414
+ document_key:str=None, temperature:float=0.0, verbose:bool=False, return_messages_log:bool=False, **kwrs) -> List[FrameExtractionUnitResult]:
365
415
  """
366
- This method inputs a text and outputs a string generated by LLM.
416
+ This method inputs a text and outputs a list of outputs per unit.
367
417
 
368
418
  Parameters:
369
419
  ----------
@@ -371,44 +421,349 @@ class BasicFrameExtractor(FrameExtractor):
371
421
  the input text content to put in prompt template.
372
422
If str, the prompt template must have only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
373
423
  If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
374
- max_new_tokens : str, Optional
375
- the max number of new tokens LLM can generate.
424
+ max_new_tokens : int, Optional
425
+ the max number of new tokens LLM should generate.
426
+ document_key : str, Optional
427
+ specify the key in text_content where document text is.
428
+ If text_content is str, this parameter will be ignored.
376
429
  temperature : float, Optional
377
- the temperature for token sampling.
378
- stream : bool, Optional
430
+ the temperature for token sampling.
431
+ verbose : bool, Optional
379
432
  if True, LLM generated text will be printed in terminal in real-time.
380
433
  return_messages_log : bool, Optional
381
434
  if True, a list of messages will be returned.
382
435
 
383
- Return : str
384
- the output from LLM. Need post-processing.
436
+ Return : List[FrameExtractionUnitResult]
437
+ the output from LLM for each unit. Contains the start, end, text, and generated text.
438
+ """
439
+ # define output
440
+ output = []
441
+ # unit chunking
442
+ if isinstance(text_content, str):
443
+ doc_text = text_content
444
+
445
+ elif isinstance(text_content, dict):
446
+ if document_key is None:
447
+ raise ValueError("document_key must be provided when text_content is dict.")
448
+ doc_text = text_content[document_key]
449
+
450
+ units = self.unit_chunker.chunk(doc_text)
451
+ # context chunker init
452
+ self.context_chunker.fit(doc_text, units)
453
+ # messages log
454
+ if return_messages_log:
455
+ messages_log = []
456
+
457
+ # generate unit by unit
458
+ for i, unit in enumerate(units):
459
+ # construct chat messages
460
+ messages = []
461
+ if self.system_prompt:
462
+ messages.append({'role': 'system', 'content': self.system_prompt})
463
+
464
+ context = self.context_chunker.chunk(unit)
465
+
466
+ if context == "":
467
+ # no context, just place unit in user prompt
468
+ if isinstance(text_content, str):
469
+ messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
470
+ else:
471
+ unit_content = text_content.copy()
472
+ unit_content[document_key] = unit.text
473
+ messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
474
+ else:
475
+ # insert context to user prompt
476
+ if isinstance(text_content, str):
477
+ messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
478
+ else:
479
+ context_content = text_content.copy()
480
+ context_content[document_key] = context
481
+ messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
482
+ # simulate conversation where assistant confirms
483
+ messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
484
+ # place unit of interest
485
+ messages.append({'role': 'user', 'content': unit.text})
486
+
487
+ if verbose:
488
+ print(f"\n\n{Fore.GREEN}Unit {i}:{Style.RESET_ALL}\n{unit.text}\n")
489
+ if context != "":
490
+ print(f"{Fore.YELLOW}Context:{Style.RESET_ALL}\n{context}\n")
491
+
492
+ print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
493
+
494
+ response_stream = self.inference_engine.chat(
495
+ messages=messages,
496
+ max_new_tokens=max_new_tokens,
497
+ temperature=temperature,
498
+ stream=True,
499
+ **kwrs
500
+ )
501
+
502
+ gen_text = ""
503
+ for chunk in response_stream:
504
+ gen_text += chunk
505
+ print(chunk, end='', flush=True)
506
+
507
+ else:
508
+ gen_text = self.inference_engine.chat(
509
+ messages=messages,
510
+ max_new_tokens=max_new_tokens,
511
+ temperature=temperature,
512
+ stream=False,
513
+ **kwrs
514
+ )
515
+
516
+ if return_messages_log:
517
+ messages.append({"role": "assistant", "content": gen_text})
518
+ messages_log.append(messages)
519
+
520
+ # add to output
521
+ result = FrameExtractionUnitResult(
522
+ start=unit.start,
523
+ end=unit.end,
524
+ text=unit.text,
525
+ gen_text=gen_text)
526
+ output.append(result)
527
+
528
+ if return_messages_log:
529
+ return output, messages_log
530
+
531
+ return output
532
+
533
+ def stream(self, text_content: Union[str, Dict[str, str]], max_new_tokens: int = 2048, document_key: str = None,
534
+ temperature: float = 0.0, **kwrs) -> Generator[Dict[str, Any], None, List[FrameExtractionUnitResult]]:
535
+ """
536
+ Streams LLM responses per unit with structured event types,
537
+ and returns collected data for post-processing.
538
+
539
+ Yields:
540
+ -------
541
+ Dict[str, Any]: (type, data)
542
+ - {"type": "info", "data": str_message}: General informational messages.
543
+ - {"type": "unit", "data": dict_unit_info}: Signals start of a new unit. dict_unit_info contains {'id', 'text', 'start', 'end'}
544
+ - {"type": "context", "data": str_context}: Context string for the current unit.
545
+ - {"type": "llm_chunk", "data": str_chunk}: A raw chunk from the LLM.
546
+
547
+ Returns:
548
+ --------
549
+ List[FrameExtractionUnitResult]:
550
+ A list of FrameExtractionUnitResult objects, each containing the
551
+ original unit details and the fully accumulated 'gen_text' from the LLM.
552
+ """
553
+ collected_results: List[FrameExtractionUnitResult] = []
554
+
555
+ if isinstance(text_content, str):
556
+ doc_text = text_content
557
+ elif isinstance(text_content, dict):
558
+ if document_key is None:
559
+ raise ValueError("document_key must be provided when text_content is dict.")
560
+ if document_key not in text_content:
561
+ raise ValueError(f"document_key '{document_key}' not found in text_content.")
562
+ doc_text = text_content[document_key]
563
+ else:
564
+ raise TypeError("text_content must be a string or a dictionary.")
565
+
566
+ units: List[FrameExtractionUnit] = self.unit_chunker.chunk(doc_text)
567
+ self.context_chunker.fit(doc_text, units)
568
+
569
+ yield {"type": "info", "data": f"Starting LLM processing for {len(units)} units."}
570
+
571
+ for i, unit in enumerate(units):
572
+ unit_info_payload = {"id": i, "text": unit.text, "start": unit.start, "end": unit.end}
573
+ yield {"type": "unit", "data": unit_info_payload}
574
+
575
+ messages = []
576
+ if self.system_prompt:
577
+ messages.append({'role': 'system', 'content': self.system_prompt})
578
+
579
+ context_str = self.context_chunker.chunk(unit)
580
+
581
+ # Construct prompt input based on whether text_content was str or dict
582
+ if context_str:
583
+ yield {"type": "context", "data": context_str}
584
+ prompt_input_for_context = context_str
585
+ if isinstance(text_content, dict):
586
+ context_content_dict = text_content.copy()
587
+ context_content_dict[document_key] = context_str
588
+ prompt_input_for_context = context_content_dict
589
+ messages.append({'role': 'user', 'content': self._get_user_prompt(prompt_input_for_context)})
590
+ messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
591
+ messages.append({'role': 'user', 'content': unit.text})
592
+ else: # No context
593
+ prompt_input_for_unit = unit.text
594
+ if isinstance(text_content, dict):
595
+ unit_content_dict = text_content.copy()
596
+ unit_content_dict[document_key] = unit.text
597
+ prompt_input_for_unit = unit_content_dict
598
+ messages.append({'role': 'user', 'content': self._get_user_prompt(prompt_input_for_unit)})
599
+
600
+ current_gen_text = ""
601
+
602
+ response_stream = self.inference_engine.chat(
603
+ messages=messages,
604
+ max_new_tokens=max_new_tokens,
605
+ temperature=temperature,
606
+ stream=True,
607
+ **kwrs
608
+ )
609
+ for chunk in response_stream:
610
+ yield {"type": "llm_chunk", "data": chunk}
611
+ current_gen_text += chunk
612
+
613
+ # Store the result for this unit
614
+ result_for_unit = FrameExtractionUnitResult(
615
+ start=unit.start,
616
+ end=unit.end,
617
+ text=unit.text,
618
+ gen_text=current_gen_text
619
+ )
620
+ collected_results.append(result_for_unit)
621
+
622
+ yield {"type": "info", "data": "All units processed by LLM."}
623
+ return collected_results
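A consumption sketch for the event protocol documented above, assuming the extractor from the earlier construction sketch. It relies on standard generator semantics: the return value of stream() rides on StopIteration.value.

note_text = "..."  # placeholder: the document text

gen = extractor.stream(text_content=note_text, max_new_tokens=1024)
results = None
try:
    while True:
        event = next(gen)
        if event["type"] == "unit":
            print(f"\n--- unit {event['data']['id']} ---")
        elif event["type"] == "llm_chunk":
            print(event["data"], end="", flush=True)
        # "info" and "context" events could be logged here as well.
except StopIteration as stop:
    # The accumulated List[FrameExtractionUnitResult] is the generator's return value.
    results = stop.value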
624
+
625
+ async def extract_async(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=2048, document_key:str=None, temperature:float=0.0,
626
+ concurrent_batch_size:int=32, return_messages_log:bool=False, **kwrs) -> List[FrameExtractionUnitResult]:
627
+ """
628
+ This is the asynchronous version of the extract() method.
629
+
630
+ Parameters:
631
+ ----------
632
+ text_content : Union[str, Dict[str,str]]
633
+ the input text content to put in prompt template.
634
+ If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
635
+ If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
636
+ max_new_tokens : int, Optional
637
+ the max number of new tokens LLM should generate.
638
+ document_key : str, Optional
639
+ specify the key in text_content where document text is.
640
+ If text_content is str, this parameter will be ignored.
641
+ temperature : float, Optional
642
+ the temperature for token sampling.
643
+ concurrent_batch_size : int, Optional
644
+ the batch size for concurrent processing.
645
+ return_messages_log : bool, Optional
646
+ if True, a list of messages will be returned.
647
+
648
+ Return : List[FrameExtractionUnitResult]
649
+ the output from LLM for each unit. Contains the start, end, text, and generated text.
385
650
  """
386
- messages = []
387
- if self.system_prompt:
388
- messages.append({'role': 'system', 'content': self.system_prompt})
651
+ if isinstance(text_content, str):
652
+ doc_text = text_content
653
+ elif isinstance(text_content, dict):
654
+ if document_key is None:
655
+ raise ValueError("document_key must be provided when text_content is dict.")
656
+ if document_key not in text_content:
657
+ raise ValueError(f"document_key '{document_key}' not found in text_content dictionary.")
658
+ doc_text = text_content[document_key]
659
+ else:
660
+ raise TypeError("text_content must be a string or a dictionary.")
661
+
662
+ units = self.unit_chunker.chunk(doc_text)
663
+
664
+ # context chunker init
665
+ self.context_chunker.fit(doc_text, units)
666
+
667
+ # Prepare inputs for all units first
668
+ tasks_input = []
669
+ for i, unit in enumerate(units):
670
+ # construct chat messages
671
+ messages = []
672
+ if self.system_prompt:
673
+ messages.append({'role': 'system', 'content': self.system_prompt})
674
+
675
+ context = self.context_chunker.chunk(unit)
676
+
677
+ if context == "":
678
+ # no context, just place unit in user prompt
679
+ if isinstance(text_content, str):
680
+ messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
681
+ else:
682
+ unit_content = text_content.copy()
683
+ unit_content[document_key] = unit.text
684
+ messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
685
+ else:
686
+ # insert context to user prompt
687
+ if isinstance(text_content, str):
688
+ messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
689
+ else:
690
+ context_content = text_content.copy()
691
+ context_content[document_key] = context
692
+ messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
693
+ # simulate conversation where assistant confirms
694
+ messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
695
+ # place unit of interest
696
+ messages.append({'role': 'user', 'content': unit.text})
697
+
698
+ # Store unit and messages together for the task
699
+ tasks_input.append({"unit": unit, "messages": messages, "original_index": i})
389
700
 
390
- messages.append({'role': 'user', 'content': self._get_user_prompt(text_content)})
391
- response = self.inference_engine.chat(
701
+ # Process units concurrently with asyncio.Semaphore
702
+ semaphore = asyncio.Semaphore(concurrent_batch_size)
703
+
704
+ async def semaphore_helper(task_data: Dict, max_new_tokens: int, temperature: float, **kwrs):
705
+ unit = task_data["unit"]
706
+ messages = task_data["messages"]
707
+ original_index = task_data["original_index"]
708
+
709
+ async with semaphore:
710
+ gen_text = await self.inference_engine.chat_async(
392
711
  messages=messages,
393
- max_new_tokens=max_new_tokens,
712
+ max_new_tokens=max_new_tokens,
394
713
  temperature=temperature,
395
- stream=stream,
396
714
  **kwrs
397
715
  )
398
-
716
+ return {"original_index": original_index, "unit": unit, "gen_text": gen_text, "messages": messages}
717
+
718
+ # Create and gather tasks
719
+ tasks = []
720
+ for task_inp in tasks_input:
721
+ task = asyncio.create_task(semaphore_helper(
722
+ task_inp,
723
+ max_new_tokens=max_new_tokens,
724
+ temperature=temperature,
725
+ **kwrs
726
+ ))
727
+ tasks.append(task)
728
+
729
+ results_raw = await asyncio.gather(*tasks)
730
+
731
+ # Sort results back into original order using the index stored
732
+ results_raw.sort(key=lambda x: x["original_index"])
733
+
734
+ # Restructure the results
735
+ output: List[FrameExtractionUnitResult] = []
736
+ messages_log: Optional[List[List[Dict[str, str]]]] = [] if return_messages_log else None
737
+
738
+ for result_data in results_raw:
739
+ unit = result_data["unit"]
740
+ gen_text = result_data["gen_text"]
741
+
742
+ # Create result object
743
+ result = FrameExtractionUnitResult(
744
+ start=unit.start,
745
+ end=unit.end,
746
+ text=unit.text,
747
+ gen_text=gen_text
748
+ )
749
+ output.append(result)
750
+
751
+ # Append to messages log if requested
752
+ if return_messages_log:
753
+ final_messages = result_data["messages"] + [{"role": "assistant", "content": gen_text}]
754
+ messages_log.append(final_messages)
755
+
399
756
  if return_messages_log:
400
- messages.append({"role": "assistant", "content": response})
401
- messages_log = [messages]
402
- return response, messages_log
403
-
404
- return response
405
-
757
+ return output, messages_log
758
+ else:
759
+ return output
406
760
 
407
- def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=2048,
408
- temperature:float=0.0, document_key:str=None, stream:bool=False,
409
- case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2,
410
- fuzzy_score_cutoff:float=0.8, allow_overlap_entities:bool=False,
411
- return_messages_log:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
761
+
762
+ def extract_frames(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
763
+ document_key:str=None, temperature:float=0.0, verbose:bool=False,
764
+ concurrent:bool=False, concurrent_batch_size:int=32,
765
+ case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
766
+ allow_overlap_entities:bool=False, return_messages_log:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
412
767
  """
413
768
This method inputs a text and outputs a list of LLMInformationExtractionFrame objects.
414
769
It uses the extract() method and post-processes the outputs into frames.
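A usage sketch for this method, assuming the extractor and note_text from the earlier sketches; accessing start, end, entity_text, and attr as attributes of LLMInformationExtractionFrame is an assumption based on the constructor arguments shown below.

frames = extractor.extract_frames(
    text_content=note_text,
    max_new_tokens=512,
    temperature=0.0,
    concurrent=True,           # run units through chat_async() in concurrent batches
    concurrent_batch_size=32,
)
for frame in frames:
    print(frame.start, frame.end, frame.entity_text, frame.attr)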
@@ -419,17 +774,19 @@ class BasicFrameExtractor(FrameExtractor):
419
774
  the input text content to put in prompt template.
420
775
If str, the prompt template must have only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
421
776
  If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
422
- entity_key : str
423
- the key (in ouptut JSON) for entity text. Any extraction that does not include entity key will be dropped.
424
777
max_new_tokens : int, Optional
425
778
  the max number of new tokens LLM should generate.
426
- temperature : float, Optional
427
- the temperature for token sampling.
428
779
  document_key : str, Optional
429
780
  specify the key in text_content where document text is.
430
781
  If text_content is str, this parameter will be ignored.
431
- stream : bool, Optional
432
- if True, LLM generated text will be printed in terminal in real-time.
782
+ temperature : float, Optional
783
+ the temperature for token sampling.
784
+ verbose : bool, Optional
785
+ if True, LLM generated text will be printed in terminal in real-time.
786
+ concurrent : bool, Optional
787
+ if True, the units will be processed concurrently.
788
+ concurrent_batch_size : int, Optional
789
+ the number of units to process concurrently. Only used when `concurrent` is True.
433
790
  case_sensitive : bool, Optional
434
791
  if True, entity text matching will be case-sensitive.
435
792
  fuzzy_match : bool, Optional
@@ -448,58 +805,74 @@ class BasicFrameExtractor(FrameExtractor):
448
805
Return : List[LLMInformationExtractionFrame]
449
806
  a list of frames.
450
807
  """
451
- if isinstance(text_content, str):
452
- text = text_content
453
- elif isinstance(text_content, dict):
454
- if document_key is None:
455
- raise ValueError("document_key must be provided when text_content is dict.")
456
- text = text_content[document_key]
808
+ ENTITY_KEY = "entity_text"
809
+ if concurrent:
810
+ if verbose:
811
+ warnings.warn("verbose=True is not supported in concurrent mode.", RuntimeWarning)
812
+
813
+ nest_asyncio.apply() # For Jupyter notebook. Terminal does not need this.
814
+ extraction_results = asyncio.run(self.extract_async(text_content=text_content,
815
+ max_new_tokens=max_new_tokens,
816
+ document_key=document_key,
817
+ temperature=temperature,
818
+ concurrent_batch_size=concurrent_batch_size,
819
+ return_messages_log=return_messages_log,
820
+ **kwrs)
821
+ )
822
+ else:
823
+ extraction_results = self.extract(text_content=text_content,
824
+ max_new_tokens=max_new_tokens,
825
+ document_key=document_key,
826
+ temperature=temperature,
827
+ verbose=verbose,
828
+ return_messages_log=return_messages_log,
829
+ **kwrs)
830
+
831
+ llm_output_results, messages_log = extraction_results if return_messages_log else (extraction_results, None)
457
832
 
458
833
  frame_list = []
459
- extraction_results = self.extract(text_content=text_content,
460
- max_new_tokens=max_new_tokens,
461
- temperature=temperature,
462
- stream=stream,
463
- return_messages_log=return_messages_log,
464
- **kwrs)
465
- gen_text, messages_log = extraction_results if return_messages_log else (extraction_results, None)
466
-
467
- entity_json = []
468
- for entity in self._extract_json(gen_text=gen_text):
469
- if entity_key in entity:
470
- entity_json.append(entity)
471
- else:
472
- warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{entity_key}"). This frame will be dropped.', RuntimeWarning)
473
-
474
- spans = self._find_entity_spans(text=text,
475
- entities=[e[entity_key] for e in entity_json],
476
- case_sensitive=case_sensitive,
477
- fuzzy_match=fuzzy_match,
478
- fuzzy_buffer_size=fuzzy_buffer_size,
479
- fuzzy_score_cutoff=fuzzy_score_cutoff,
480
- allow_overlap_entities=allow_overlap_entities)
481
-
482
- for i, (ent, span) in enumerate(zip(entity_json, spans)):
483
- if span is not None:
484
- start, end = span
485
- frame = LLMInformationExtractionFrame(frame_id=f"{i}",
486
- start=start,
487
- end=end,
488
- entity_text=text[start:end],
489
- attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
490
- frame_list.append(frame)
834
+ for res in llm_output_results:
835
+ entity_json = []
836
+ for entity in self._extract_json(gen_text=res.gen_text):
837
+ if ENTITY_KEY in entity:
838
+ entity_json.append(entity)
839
+ else:
840
+ warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{ENTITY_KEY}"). This frame will be dropped.', RuntimeWarning)
841
+
842
+ spans = self._find_entity_spans(text=res.text,
843
+ entities=[e[ENTITY_KEY] for e in entity_json],
844
+ case_sensitive=case_sensitive,
845
+ fuzzy_match=fuzzy_match,
846
+ fuzzy_buffer_size=fuzzy_buffer_size,
847
+ fuzzy_score_cutoff=fuzzy_score_cutoff,
848
+ allow_overlap_entities=allow_overlap_entities)
849
+ for ent, span in zip(entity_json, spans):
850
+ if span is not None:
851
+ start, end = span
852
+ entity_text = res.text[start:end]
853
+ start += res.start
854
+ end += res.start
855
+ attr = {}
856
+ if "attr" in ent and ent["attr"] is not None:
857
+ attr = ent["attr"]
858
+
859
+ frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}",
860
+ start=start,
861
+ end=end,
862
+ entity_text=entity_text,
863
+ attr=attr)
864
+ frame_list.append(frame)
491
865
 
492
866
  if return_messages_log:
493
867
  return frame_list, messages_log
494
-
495
868
  return frame_list
496
869
 
497
870
 
498
- class ReviewFrameExtractor(BasicFrameExtractor):
499
- def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
500
- review_mode:str, review_prompt:str=None,system_prompt:str=None, **kwrs):
871
+ class ReviewFrameExtractor(DirectFrameExtractor):
872
+ def __init__(self, unit_chunker:UnitChunker, context_chunker:ContextChunker,
873
+ inference_engine:InferenceEngine, prompt_template:str, review_mode:str, review_prompt:str=None, system_prompt:str=None, **kwrs):
501
874
  """
502
- This class add a review step after the BasicFrameExtractor.
875
+ This class adds a review step after the DirectFrameExtractor.
503
876
The review process asks the LLM to review its output and:
504
877
1. add more frames while keeping the current ones. This is efficient for boosting recall.
505
878
2. or, regenerate frames (add new and delete existing). A construction sketch follows below.
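A construction sketch for the review workflow, reusing the engine, chunkers, prompt_template, and note_text placeholders from the earlier sketches; passing review_prompt=None assumes a packaged default review prompt exists for this class.

reviewer = ReviewFrameExtractor(
    unit_chunker=SentenceUnitChunker(),
    context_chunker=SlideWindowContextChunker(window_size=2),  # signature assumed
    inference_engine=engine,
    prompt_template=prompt_template,
    review_mode="addition",  # keep the initial frames and ask the LLM to add more
    review_prompt=None,      # fall back to the packaged default review prompt
)
frames = reviewer.extract_frames(text_content=note_text, verbose=True)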
@@ -507,6 +880,10 @@ class ReviewFrameExtractor(BasicFrameExtractor):
507
880
 
508
881
  Parameters:
509
882
  ----------
883
+ unit_chunker : UnitChunker
884
+ the unit chunker object that determines how to chunk the document text into units.
885
+ context_chunker : ContextChunker
886
+ the context chunker object that determines how to get context for each unit.
510
887
  inference_engine : InferenceEngine
511
888
the LLM inference engine object. Must implement the chat() method.
512
889
  prompt_template : str
@@ -520,27 +897,52 @@ class ReviewFrameExtractor(BasicFrameExtractor):
520
897
  system_prompt : str, Optional
521
898
  system prompt.
522
899
  """
523
- super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
524
- system_prompt=system_prompt, **kwrs)
900
+ super().__init__(inference_engine=inference_engine,
901
+ unit_chunker=unit_chunker,
902
+ prompt_template=prompt_template,
903
+ system_prompt=system_prompt,
904
+ context_chunker=context_chunker,
905
+ **kwrs)
906
+ # check review mode
525
907
  if review_mode not in {"addition", "revision"}:
526
908
  raise ValueError('review_mode must be one of {"addition", "revision"}.')
527
909
  self.review_mode = review_mode
528
-
910
+ # assign review prompt
529
911
  if review_prompt:
530
912
  self.review_prompt = review_prompt
531
913
  else:
532
- file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
533
- joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
534
- with open(file_path, 'r', encoding="utf-8") as f:
535
- self.review_prompt = f.read()
536
-
537
- warnings.warn(f'Custom review prompt not provided. The default review prompt is used:\n"{self.review_prompt}"', UserWarning)
538
-
914
+ self.review_prompt = None
915
+ original_class_name = self.__class__.__name__
916
+
917
+ current_class_name = original_class_name
918
+ for current_class_in_mro in self.__class__.__mro__:
919
+ if current_class_in_mro is object:
920
+ continue
921
+
922
+ current_class_name = current_class_in_mro.__name__
923
+ try:
924
+ file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
925
+ joinpath(f"{current_class_name}_{self.review_mode}_review_prompt.txt")
926
+ with open(file_path, 'r', encoding="utf-8") as f:
927
+ self.review_prompt = f.read()
+ break
928
+ except FileNotFoundError:
929
+ pass
930
+
931
+ except Exception as e:
932
+ warnings.warn(
933
+ f"Error attempting to read default review prompt for '{current_class_name}' "
934
+ f"from '{str(file_path)}': {e}. Trying next in MRO.",
935
+ UserWarning
936
+ )
937
+ continue
938
+
939
+ if self.review_prompt is None:
940
+ raise ValueError(f"Cannot find review prompt for {self.__class__.__name__} in the package. Please provide a review_prompt.")
539
941
 
540
- def extract(self, text_content:Union[str, Dict[str,str]],
541
- max_new_tokens:int=4096, temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> str:
942
+ def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=2048, document_key:str=None,
943
+ temperature:float=0.0, verbose:bool=False, return_messages_log:bool=False, **kwrs) -> List[FrameExtractionUnitResult]:
542
944
  """
543
- This method inputs a text and outputs a string generated by LLM.
945
+ This method inputs a text and outputs a list of outputs per unit.
544
946
 
545
947
  Parameters:
546
948
  ----------
@@ -548,515 +950,161 @@ class ReviewFrameExtractor(BasicFrameExtractor):
548
950
  the input text content to put in prompt template.
549
951
If str, the prompt template must have only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
550
952
  If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
551
- max_new_tokens : str, Optional
552
- the max number of new tokens LLM can generate.
953
+ max_new_tokens : int, Optional
954
+ the max number of new tokens LLM should generate.
955
+ document_key : str, Optional
956
+ specify the key in text_content where document text is.
957
+ If text_content is str, this parameter will be ignored.
553
958
  temperature : float, Optional
554
- the temperature for token sampling.
555
- stream : bool, Optional
959
+ the temperature for token sampling.
960
+ verbose : bool, Optional
556
961
  if True, LLM generated text will be printed in terminal in real-time.
557
962
  return_messages_log : bool, Optional
558
963
  if True, a list of messages will be returned.
559
964
 
560
- Return : str
561
- the output from LLM. Need post-processing.
965
+ Return : List[FrameExtractionUnitResult]
966
+ the output from LLM for each unit. Contains the start, end, text, and generated text.
562
967
  """
563
- messages = []
564
- if self.system_prompt:
565
- messages.append({'role': 'system', 'content': self.system_prompt})
566
-
567
- messages.append({'role': 'user', 'content': self._get_user_prompt(text_content)})
568
- # Initial output
569
- if stream:
570
- print(f"{Fore.BLUE}Initial Output:{Style.RESET_ALL}")
571
-
572
- initial = self.inference_engine.chat(
573
- messages=messages,
574
- max_new_tokens=max_new_tokens,
575
- temperature=temperature,
576
- stream=stream,
577
- **kwrs
578
- )
579
-
580
- # Review
581
- messages.append({'role': 'assistant', 'content': initial})
582
- messages.append({'role': 'user', 'content': self.review_prompt})
583
-
584
- if stream:
585
- print(f"\n{Fore.YELLOW}Review:{Style.RESET_ALL}")
586
- review = self.inference_engine.chat(
587
- messages=messages,
588
- max_new_tokens=max_new_tokens,
589
- temperature=temperature,
590
- stream=stream,
591
- **kwrs
592
- )
593
-
594
- # Output
595
- output_text = ""
596
- if self.review_mode == "revision":
597
- output_text = review
598
- elif self.review_mode == "addition":
599
- output_text = initial + '\n' + review
968
+ # define output
969
+ output = []
970
+ # unit chunking
971
+ if isinstance(text_content, str):
972
+ doc_text = text_content
600
973
 
601
- if return_messages_log:
602
- messages.append({"role": "assistant", "content": review})
603
- messages_log = [messages]
604
- return output_text, messages_log
974
+ elif isinstance(text_content, dict):
975
+ if document_key is None:
976
+ raise ValueError("document_key must be provided when text_content is dict.")
977
+ doc_text = text_content[document_key]
605
978
 
606
- return output_text
607
-
979
+ units = self.unit_chunker.chunk(doc_text)
980
+ # context chunker init
981
+ self.context_chunker.fit(doc_text, units)
982
+ # messages log
983
+ if return_messages_log:
984
+ messages_log = []
608
985
 
609
- class SentenceFrameExtractor(FrameExtractor):
610
- from nltk.tokenize.punkt import PunktSentenceTokenizer
611
- def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None,
612
- context_sentences:Union[str, int]="all", **kwrs):
613
- """
614
- This class performs sentence-by-sentence information extraction.
615
- The process is as follows:
616
- 1. system prompt (optional)
617
- 2. user prompt with instructions (schema, background, full text, few-shot example...)
618
- 3. feed a sentence (start with first sentence)
619
- 4. LLM extract entities and attributes from the sentence
620
- 5. repeat #3 and #4
621
-
622
- Input system prompt (optional), prompt template (with user instructions),
623
- and specify a LLM.
624
-
625
- Parameters:
626
- ----------
627
- inference_engine : InferenceEngine
628
- the LLM inferencing engine object. Must implements the chat() method.
629
- prompt_template : str
630
- prompt template with "{{<placeholder name>}}" placeholder.
631
- system_prompt : str, Optional
632
- system prompt.
633
- context_sentences : Union[str, int], Optional
634
- number of sentences before and after the given sentence to provide additional context.
635
- if "all", the full text will be provided in the prompt as context.
636
- if 0, no additional context will be provided.
637
- This is good for tasks that does not require context beyond the given sentence.
638
- if > 0, the number of sentences before and after the given sentence to provide as context.
639
- This is good for tasks that require context beyond the given sentence.
640
- """
641
- super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
642
- system_prompt=system_prompt, **kwrs)
643
-
644
- if not isinstance(context_sentences, int) and context_sentences != "all":
645
- raise ValueError('context_sentences must be an integer (>= 0) or "all".')
646
-
647
- if isinstance(context_sentences, int) and context_sentences < 0:
648
- raise ValueError("context_sentences must be a positive integer.")
649
-
650
- self.context_sentences =context_sentences
651
-
652
-
653
- def _get_sentences(self, text:str) -> List[Dict[str,str]]:
654
- """
655
- This method sentence tokenize the input text into a list of sentences
656
- as dict of {start, end, sentence_text}
657
-
658
- Parameters:
659
- ----------
660
- text : str
661
- text to sentence tokenize.
662
-
663
- Returns : List[Dict[str,str]]
664
- a list of sentences as dict with keys: {"sentence_text", "start", "end"}.
665
- """
666
- sentences = []
667
- for start, end in self.PunktSentenceTokenizer().span_tokenize(text):
668
- sentences.append({"sentence_text": text[start:end],
669
- "start": start,
670
- "end": end})
671
- return sentences
672
-
673
-
674
- def _get_context_sentences(self, text_content, i:int, sentences:List[Dict[str, str]], document_key:str=None) -> str:
675
- """
676
- This function returns the context sentences for the current sentence of interest (i).
677
- """
678
- if self.context_sentences == "all":
679
- context = text_content if isinstance(text_content, str) else text_content[document_key]
680
- elif self.context_sentences == 0:
681
- context = ""
682
- else:
683
- start = max(0, i - self.context_sentences)
684
- end = min(i + 1 + self.context_sentences, len(sentences))
685
- context = " ".join([s['sentence_text'] for s in sentences[start:end]])
686
- return context
687
-
688
-
689
- def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
690
- document_key:str=None, temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict[str,str]]:
691
- """
692
- This method inputs a text and outputs a list of outputs per sentence.
693
-
694
- Parameters:
695
- ----------
696
- text_content : Union[str, Dict[str,str]]
697
- the input text content to put in prompt template.
698
- If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
699
- If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
700
- max_new_tokens : str, Optional
701
- the max number of new tokens LLM should generate.
702
- document_key : str, Optional
703
- specify the key in text_content where document text is.
704
- If text_content is str, this parameter will be ignored.
705
- temperature : float, Optional
706
- the temperature for token sampling.
707
- stream : bool, Optional
708
- if True, LLM generated text will be printed in terminal in real-time.
709
- return_messages_log : bool, Optional
710
- if True, a list of messages will be returned.
711
-
712
- Return : str
713
- the output from LLM. Need post-processing.
714
- """
715
- # define output
716
- output = []
717
- # sentence tokenization
718
- if isinstance(text_content, str):
719
- sentences = self._get_sentences(text_content)
720
- elif isinstance(text_content, dict):
721
- if document_key is None:
722
- raise ValueError("document_key must be provided when text_content is dict.")
723
- sentences = self._get_sentences(text_content[document_key])
724
-
725
- if return_messages_log:
726
- messages_log = []
727
-
728
- # generate sentence by sentence
729
- for i, sent in enumerate(sentences):
986
+ # generate unit by unit
987
+ for i, unit in enumerate(units):
988
+ # <--- Initial generation step --->
730
989
  # construct chat messages
731
990
  messages = []
732
991
  if self.system_prompt:
733
992
  messages.append({'role': 'system', 'content': self.system_prompt})
734
993
 
735
- context = self._get_context_sentences(text_content, i, sentences, document_key)
994
+ context = self.context_chunker.chunk(unit)
736
995
 
737
- if self.context_sentences == 0:
738
- # no context, just place sentence of interest
996
+ if context == "":
997
+ # no context, just place unit in user prompt
739
998
  if isinstance(text_content, str):
740
- messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
999
+ messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
741
1000
  else:
742
- sentence_content = text_content.copy()
743
- sentence_content[document_key] = sent['sentence_text']
744
- messages.append({'role': 'user', 'content': self._get_user_prompt(sentence_content)})
1001
+ unit_content = text_content.copy()
1002
+ unit_content[document_key] = unit.text
1003
+ messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
745
1004
  else:
746
- # insert context
1005
+ # insert context to user prompt
747
1006
  if isinstance(text_content, str):
748
1007
  messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
749
1008
  else:
750
1009
  context_content = text_content.copy()
751
1010
  context_content[document_key] = context
752
1011
  messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
753
- # simulate conversation
754
- messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
755
- # place sentence of interest
756
- messages.append({'role': 'user', 'content': sent['sentence_text']})
757
-
758
- if stream:
759
- print(f"\n\n{Fore.GREEN}Sentence {i}:{Style.RESET_ALL}\n{sent['sentence_text']}\n")
760
- if isinstance(self.context_sentences, int) and self.context_sentences > 0:
1012
+ # simulate conversation where assistant confirms
1013
+ messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
1014
+ # place unit of interest
1015
+ messages.append({'role': 'user', 'content': unit.text})
1016
+
1017
+ if verbose:
1018
+ print(f"\n\n{Fore.GREEN}Unit {i}:{Style.RESET_ALL}\n{unit.text}\n")
1019
+ if context != "":
761
1020
  print(f"{Fore.YELLOW}Context:{Style.RESET_ALL}\n{context}\n")
762
1021
 
763
1022
  print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
764
1023
 
765
- gen_text = self.inference_engine.chat(
766
- messages=messages,
767
- max_new_tokens=max_new_tokens,
768
- temperature=temperature,
769
- stream=stream,
770
- **kwrs
771
- )
1024
+ response_stream = self.inference_engine.chat(
1025
+ messages=messages,
1026
+ max_new_tokens=max_new_tokens,
1027
+ temperature=temperature,
1028
+ stream=True,
1029
+ **kwrs
1030
+ )
1031
+
1032
+ initial = ""
1033
+ for chunk in response_stream:
1034
+ initial += chunk
1035
+ print(chunk, end='', flush=True)
772
1036
 
1037
+ else:
1038
+ initial = self.inference_engine.chat(
1039
+ messages=messages,
1040
+ max_new_tokens=max_new_tokens,
1041
+ temperature=temperature,
1042
+ stream=False,
1043
+ **kwrs
1044
+ )
1045
+
773
1046
  if return_messages_log:
774
- messages.append({"role": "assistant", "content": gen_text})
1047
+ messages.append({"role": "assistant", "content": initial})
775
1048
  messages_log.append(messages)
776
1049
 
777
- # add to output
778
- output.append({'sentence_start': sent['start'],
779
- 'sentence_end': sent['end'],
780
- 'sentence_text': sent['sentence_text'],
781
- 'gen_text': gen_text})
782
-
783
- if return_messages_log:
784
- return output, messages_log
785
-
786
- return output
787
-
788
-
789
- async def extract_async(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
790
- document_key:str=None, temperature:float=0.0, concurrent_batch_size:int=32,
791
- return_messages_log:bool=False, **kwrs) -> List[Dict[str,str]]:
792
- """
793
- The asynchronous version of the extract() method.
794
-
795
- Parameters:
796
- ----------
797
- text_content : Union[str, Dict[str,str]]
798
- the input text content to put in prompt template.
799
- If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
800
- If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
801
- max_new_tokens : str, Optional
802
- the max number of new tokens LLM should generate.
803
- document_key : str, Optional
804
- specify the key in text_content where document text is.
805
- If text_content is str, this parameter will be ignored.
806
- temperature : float, Optional
807
- the temperature for token sampling.
808
- concurrent_batch_size : int, Optional
809
- the number of sentences to process in concurrent.
810
- return_messages_log : bool, Optional
811
- if True, a list of messages will be returned.
812
-
813
- Return : str
814
- the output from LLM. Need post-processing.
815
- """
816
- # Check if self.inference_engine.chat_async() is implemented
817
- if not hasattr(self.inference_engine, 'chat_async'):
818
- raise NotImplementedError(f"{self.inference_engine.__class__.__name__} does not have chat_async() method.")
819
-
820
- # define output
821
- output = []
822
- # sentence tokenization
823
- if isinstance(text_content, str):
824
- sentences = self._get_sentences(text_content)
825
- elif isinstance(text_content, dict):
826
- if document_key is None:
827
- raise ValueError("document_key must be provided when text_content is dict.")
828
- sentences = self._get_sentences(text_content[document_key])
829
-
830
- if return_messages_log:
831
- messages_log = []
1050
+ # <--- Review step --->
1051
+ if verbose:
1052
+ print(f"\n{Fore.YELLOW}Review:{Style.RESET_ALL}")
832
1053
 
833
- # generate sentence by sentence
834
- for i in range(0, len(sentences), concurrent_batch_size):
835
- tasks = []
836
- batch = sentences[i:i + concurrent_batch_size]
837
- batch_messages = []
838
- for j, sent in enumerate(batch):
839
- # construct chat messages
840
- messages = []
841
- if self.system_prompt:
842
- messages.append({'role': 'system', 'content': self.system_prompt})
1054
+ messages.append({'role': 'assistant', 'content': initial})
1055
+ messages.append({'role': 'user', 'content': self.review_prompt})
843
1056
 
844
- context = self._get_context_sentences(text_content, i + j, sentences, document_key)
845
-
846
- if self.context_sentences == 0:
847
- # no context, just place sentence of interest
848
- if isinstance(text_content, str):
849
- messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
850
- else:
851
- sentence_content = text_content.copy()
852
- sentence_content[document_key] = sent['sentence_text']
853
- messages.append({'role': 'user', 'content': self._get_user_prompt(sentence_content)})
854
- else:
855
- # insert context
856
- if isinstance(text_content, str):
857
- messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
858
- else:
859
- context_content = text_content.copy()
860
- context_content[document_key] = context
861
- messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
862
- # simulate conversation
863
- messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
864
- # place sentence of interest
865
- messages.append({'role': 'user', 'content': sent['sentence_text']})
1057
+ if verbose:
1058
+ response_stream = self.inference_engine.chat(
1059
+ messages=messages,
1060
+ max_new_tokens=max_new_tokens,
1061
+ temperature=temperature,
1062
+ stream=True,
1063
+ **kwrs
1064
+ )
866
1065
 
867
- # add to tasks
868
- task = asyncio.create_task(
869
- self.inference_engine.chat_async(
1066
+ review = ""
1067
+ for chunk in response_stream:
1068
+ review += chunk
1069
+ print(chunk, end='', flush=True)
1070
+
1071
+ else:
1072
+ review = self.inference_engine.chat(
870
1073
  messages=messages,
871
1074
  max_new_tokens=max_new_tokens,
872
1075
  temperature=temperature,
1076
+ stream=False,
873
1077
  **kwrs
874
1078
  )
875
- )
876
- tasks.append(task)
877
- batch_messages.append(messages)
878
1079
 
879
- # Wait until the batch is done, collect results and move on to next batch
880
- responses = await asyncio.gather(*tasks)
1080
+ # Output
1081
+ if self.review_mode == "revision":
1082
+ gen_text = review
1083
+ elif self.review_mode == "addition":
1084
+ gen_text = initial + '\n' + review
881
1085
 
882
- # Collect outputs
883
- for gen_text, sent, messages in zip(responses, batch, batch_messages):
884
- if return_messages_log:
885
- messages.append({"role": "assistant", "content": gen_text})
886
- messages_log.append(messages)
1086
+ if return_messages_log:
1087
+ messages.append({"role": "assistant", "content": review})
1088
+ messages_log.append(messages)
887
1089
 
888
- output.append({'sentence_start': sent['start'],
889
- 'sentence_end': sent['end'],
890
- 'sentence_text': sent['sentence_text'],
891
- 'gen_text': gen_text})
892
-
1090
+ # add to output
1091
+ result = FrameExtractionUnitResult(
1092
+ start=unit.start,
1093
+ end=unit.end,
1094
+ text=unit.text,
1095
+ gen_text=gen_text)
1096
+ output.append(result)
1097
+
893
1098
  if return_messages_log:
894
1099
  return output, messages_log
895
1100
 
896
1101
  return output
897
-
898
1102
 
899
- def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=512,
900
- document_key:str=None, temperature:float=0.0, stream:bool=False,
901
- concurrent:bool=False, concurrent_batch_size:int=32,
902
- case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
903
- allow_overlap_entities:bool=False, return_messages_log:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
904
- """
905
- This method inputs a text and outputs a list of LLMInformationExtractionFrame
906
- It use the extract() method and post-process outputs into frames.
907
1103
 
908
- Parameters:
909
- ----------
910
- text_content : Union[str, Dict[str,str]]
911
- the input text content to put in prompt template.
912
- If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
913
- If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
914
- entity_key : str
915
- the key (in ouptut JSON) for entity text.
916
- max_new_tokens : str, Optional
917
- the max number of new tokens LLM should generate.
918
- document_key : str, Optional
919
- specify the key in text_content where document text is.
920
- If text_content is str, this parameter will be ignored.
921
- temperature : float, Optional
922
- the temperature for token sampling.
923
- stream : bool, Optional
924
- if True, LLM generated text will be printed in terminal in real-time.
925
- concurrent : bool, Optional
926
- if True, the sentences will be extracted in concurrent.
927
- concurrent_batch_size : int, Optional
928
- the number of sentences to process in concurrent. Only used when `concurrent` is True.
929
- case_sensitive : bool, Optional
930
- if True, entity text matching will be case-sensitive.
931
- fuzzy_match : bool, Optional
932
- if True, fuzzy matching will be applied to find entity text.
933
- fuzzy_buffer_size : float, Optional
934
- the buffer size for fuzzy matching. Default is 20% of entity text length.
935
- fuzzy_score_cutoff : float, Optional
936
- the Jaccard score cutoff for fuzzy matching.
937
- Matched entity text must have a score higher than this value or a None will be returned.
938
- allow_overlap_entities : bool, Optional
939
- if True, entities can overlap in the text.
940
- Note that this can cause multiple frames to be generated on the same entity span if they have same entity text.
941
- return_messages_log : bool, Optional
942
- if True, a list of messages will be returned.
943
-
944
- Return : str
945
- a list of frames.
1104
+ def stream(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=2048,
1105
+ document_key:str=None, temperature:float=0.0, **kwrs) -> Generator[str, None, None]:
946
1106
  """
947
- if concurrent:
948
- if stream:
949
- warnings.warn("stream=True is not supported in concurrent mode.", RuntimeWarning)
950
-
951
- nest_asyncio.apply() # For Jupyter notebook. Terminal does not need this.
952
- extraction_results = asyncio.run(self.extract_async(text_content=text_content,
953
- max_new_tokens=max_new_tokens,
954
- document_key=document_key,
955
- temperature=temperature,
956
- concurrent_batch_size=concurrent_batch_size,
957
- return_messages_log=return_messages_log,
958
- **kwrs)
959
- )
960
- else:
961
- extraction_results = self.extract(text_content=text_content,
962
- max_new_tokens=max_new_tokens,
963
- document_key=document_key,
964
- temperature=temperature,
965
- stream=stream,
966
- return_messages_log=return_messages_log,
967
- **kwrs)
968
-
969
- llm_output_sentences, messages_log = extraction_results if return_messages_log else (extraction_results, None)
970
-
971
- frame_list = []
972
- for sent in llm_output_sentences:
973
- entity_json = []
974
- for entity in self._extract_json(gen_text=sent['gen_text']):
975
- if entity_key in entity:
976
- entity_json.append(entity)
977
- else:
978
- warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{entity_key}"). This frame will be dropped.', RuntimeWarning)
979
-
980
- spans = self._find_entity_spans(text=sent['sentence_text'],
981
- entities=[e[entity_key] for e in entity_json],
982
- case_sensitive=case_sensitive,
983
- fuzzy_match=fuzzy_match,
984
- fuzzy_buffer_size=fuzzy_buffer_size,
985
- fuzzy_score_cutoff=fuzzy_score_cutoff,
986
- allow_overlap_entities=allow_overlap_entities)
987
- for ent, span in zip(entity_json, spans):
988
- if span is not None:
989
- start, end = span
990
- entity_text = sent['sentence_text'][start:end]
991
- start += sent['sentence_start']
992
- end += sent['sentence_start']
993
- frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}",
994
- start=start,
995
- end=end,
996
- entity_text=entity_text,
997
- attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
998
- frame_list.append(frame)
999
-
1000
- if return_messages_log:
1001
- return frame_list, messages_log
1002
- return frame_list
1003
-
1004
-
1005
- class SentenceReviewFrameExtractor(SentenceFrameExtractor):
1006
- def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
1007
- review_mode:str, review_prompt:str=None, system_prompt:str=None,
1008
- context_sentences:Union[str, int]="all", **kwrs):
1009
- """
1010
- This class adds a review step after the SentenceFrameExtractor.
1011
- For each sentence, the review process asks LLM to review its output and:
1012
- 1. add more frames while keeping current. This is efficient for boosting recall.
1013
- 2. or, regenerate frames (add new and delete existing).
1014
- Use the review_mode parameter to specify. Note that the review_prompt should instruct LLM accordingly.
1015
-
1016
- Parameters:
1017
- ----------
1018
- inference_engine : InferenceEngine
1019
- the LLM inferencing engine object. Must implements the chat() method.
1020
- prompt_template : str
1021
- prompt template with "{{<placeholder name>}}" placeholder.
1022
- review_prompt : str: Optional
1023
- the prompt text that ask LLM to review. Specify addition or revision in the instruction.
1024
- if not provided, a default review prompt will be used.
1025
- review_mode : str
1026
- review mode. Must be one of {"addition", "revision"}
1027
- addition mode only ask LLM to add new frames, while revision mode ask LLM to regenerate.
1028
- system_prompt : str, Optional
1029
- system prompt.
1030
- context_sentences : Union[str, int], Optional
1031
- number of sentences before and after the given sentence to provide additional context.
1032
- if "all", the full text will be provided in the prompt as context.
1033
- if 0, no additional context will be provided.
1034
- This is good for tasks that does not require context beyond the given sentence.
1035
- if > 0, the number of sentences before and after the given sentence to provide as context.
1036
- This is good for tasks that require context beyond the given sentence.
1037
- """
1038
- super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
1039
- system_prompt=system_prompt, context_sentences=context_sentences, **kwrs)
1040
-
1041
- if review_mode not in {"addition", "revision"}:
1042
- raise ValueError('review_mode must be one of {"addition", "revision"}.')
1043
- self.review_mode = review_mode
1044
-
1045
- if review_prompt:
1046
- self.review_prompt = review_prompt
1047
- else:
1048
- file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
1049
- joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
1050
- with open(file_path, 'r', encoding="utf-8") as f:
1051
- self.review_prompt = f.read()
1052
-
1053
- warnings.warn(f'Custom review prompt not provided. The default review prompt is used:\n"{self.review_prompt}"', UserWarning)
1054
-
1055
-
1056
- def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
1057
- document_key:str=None, temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict[str,str]]:
1058
- """
1059
- This method inputs a text and outputs a list of outputs per sentence.
1107
+ This method inputs a text, runs extraction unit by unit, and streams the LLM output as it is generated.
1060
1108
 
1061
1109
  Parameters:
1062
1110
  ----------
@@ -1064,281 +1112,371 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
1064
1112
  the input text content to put in prompt template.
1065
1113
  If str, the prompt template must have only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
1066
1114
  If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
1067
- max_new_tokens : str, Optional
1115
+ max_new_tokens : int, Optional
1068
1116
  the max number of new tokens LLM should generate.
1069
1117
  document_key : str, Optional
1070
1118
  specify the key in text_content where document text is.
1071
1119
  If text_content is str, this parameter will be ignored.
1072
1120
  temperature : float, Optional
1073
1121
  the temperature for token sampling.
1074
- stream : bool, Optional
1075
- if True, LLM generated text will be printed in terminal in real-time.
1076
- return_messages_log : bool, Optional
1077
- if True, a list of messages will be returned.
1078
1122
 
1079
- Return : str
1080
- the output from LLM. Need post-processing.
1123
+ Return : Generator[str, None, None]
1124
+ yields the LLM output chunk by chunk for each unit, interleaved with colored unit, context, extraction, and review headers.
1081
1125
  """
1082
- # define output
1083
- output = []
1084
- # sentence tokenization
1126
+ # unit chunking
1085
1127
  if isinstance(text_content, str):
1086
- sentences = self._get_sentences(text_content)
1128
+ doc_text = text_content
1129
+
1087
1130
  elif isinstance(text_content, dict):
1088
1131
  if document_key is None:
1089
1132
  raise ValueError("document_key must be provided when text_content is dict.")
1090
- sentences = self._get_sentences(text_content[document_key])
1133
+ doc_text = text_content[document_key]
1091
1134
 
1092
- if return_messages_log:
1093
- messages_log = []
1135
+ units = self.unit_chunker.chunk(doc_text)
1136
+ # context chunker init
1137
+ self.context_chunker.fit(doc_text, units)
1094
1138
 
1095
- # generate sentence by sentence
1096
- for i, sent in enumerate(sentences):
1139
+ # generate unit by unit
1140
+ for i, unit in enumerate(units):
1141
+ # <--- Initial generation step --->
1097
1142
  # construct chat messages
1098
1143
  messages = []
1099
1144
  if self.system_prompt:
1100
1145
  messages.append({'role': 'system', 'content': self.system_prompt})
1101
1146
 
1102
- context = self._get_context_sentences(text_content, i, sentences, document_key)
1147
+ context = self.context_chunker.chunk(unit)
1103
1148
 
1104
- if self.context_sentences == 0:
1105
- # no context, just place sentence of interest
1149
+ if context == "":
1150
+ # no context, just place unit in user prompt
1106
1151
  if isinstance(text_content, str):
1107
- messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
1152
+ messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
1108
1153
  else:
1109
- sentence_content = text_content.copy()
1110
- sentence_content[document_key] = sent['sentence_text']
1111
- messages.append({'role': 'user', 'content': self._get_user_prompt(sentence_content)})
1154
+ unit_content = text_content.copy()
1155
+ unit_content[document_key] = unit.text
1156
+ messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
1112
1157
  else:
1113
- # insert context
1158
+ # insert context to user prompt
1114
1159
  if isinstance(text_content, str):
1115
1160
  messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
1116
1161
  else:
1117
1162
  context_content = text_content.copy()
1118
1163
  context_content[document_key] = context
1119
1164
  messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
1120
- # simulate conversation
1121
- messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
1122
- # place sentence of interest
1123
- messages.append({'role': 'user', 'content': sent['sentence_text']})
1165
+ # simulate conversation where assistant confirms
1166
+ messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
1167
+ # place unit of interest
1168
+ messages.append({'role': 'user', 'content': unit.text})
1124
1169
 
1125
- if stream:
1126
- print(f"\n\n{Fore.GREEN}Sentence {i}: {Style.RESET_ALL}\n{sent['sentence_text']}\n")
1127
- if isinstance(self.context_sentences, int) and self.context_sentences > 0:
1128
- print(f"{Fore.YELLOW}Context:{Style.RESET_ALL}\n{context}\n")
1129
- print(f"{Fore.BLUE}Initial Output:{Style.RESET_ALL}")
1130
1170
 
1131
- initial = self.inference_engine.chat(
1171
+ yield f"\n\n{Fore.GREEN}Unit {i}:{Style.RESET_ALL}\n{unit.text}\n"
1172
+ if context != "":
1173
+ yield f"{Fore.YELLOW}Context:{Style.RESET_ALL}\n{context}\n"
1174
+
1175
+ yield f"{Fore.BLUE}Extraction:{Style.RESET_ALL}\n"
1176
+
1177
+ response_stream = self.inference_engine.chat(
1132
1178
  messages=messages,
1133
1179
  max_new_tokens=max_new_tokens,
1134
1180
  temperature=temperature,
1135
- stream=stream,
1181
+ stream=True,
1136
1182
  **kwrs
1137
1183
  )
1138
1184
 
1139
- # Review
1140
- if stream:
1141
- print(f"\n{Fore.YELLOW}Review:{Style.RESET_ALL}")
1185
+ initial = ""
1186
+ for chunk in response_stream:
1187
+ initial += chunk
1188
+ yield chunk
1189
+
1190
+ # <--- Review step --->
1191
+ yield f"\n{Fore.YELLOW}Review:{Style.RESET_ALL}"
1142
1192
 
1143
1193
  messages.append({'role': 'assistant', 'content': initial})
1144
1194
  messages.append({'role': 'user', 'content': self.review_prompt})
1145
1195
 
1146
- review = self.inference_engine.chat(
1196
+ response_stream = self.inference_engine.chat(
1147
1197
  messages=messages,
1148
1198
  max_new_tokens=max_new_tokens,
1149
1199
  temperature=temperature,
1150
- stream=stream,
1200
+ stream=True,
1151
1201
  **kwrs
1152
1202
  )
1153
1203
 
1154
- # Output
1155
- if self.review_mode == "revision":
1156
- gen_text = review
1157
- elif self.review_mode == "addition":
1158
- gen_text = initial + '\n' + review
1159
-
1160
- if return_messages_log:
1161
- messages.append({"role": "assistant", "content": review})
1162
- messages_log.append(messages)
1204
+ for chunk in response_stream:
1205
+ yield chunk
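
For orientation, an editorial usage sketch (not part of the released diff) of how a streaming generator like the one above might be consumed. The engine class, model name, prompt text, and the note variable are assumptions for illustration only.

from llm_ie.engines import OllamaInferenceEngine   # assumed engine; any InferenceEngine subclass should work

engine = OllamaInferenceEngine(model_name="llama3.1:8b")            # illustrative model
extractor = SentenceReviewFrameExtractor(inference_engine=engine,   # concrete subclass defined later in this file
                                          prompt_template="Extract diagnoses as JSON: {{input}}",
                                          review_mode="addition")
note = "Patient denies chest pain. Reports shortness of breath on exertion."
for chunk in extractor.stream(note):
    print(chunk, end="", flush=True)   # colored Unit/Context/Extraction/Review headers plus generated text
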
1163
1206
 
1164
- # add to output
1165
- output.append({'sentence_start': sent['start'],
1166
- 'sentence_end': sent['end'],
1167
- 'sentence_text': sent['sentence_text'],
1168
- 'gen_text': gen_text})
1169
-
1170
- if return_messages_log:
1171
- return output, messages_log
1172
-
1173
- return output
1174
-
1175
- async def extract_async(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
1176
- document_key:str=None, temperature:float=0.0, concurrent_batch_size:int=32, return_messages_log:bool=False, **kwrs) -> List[Dict[str,str]]:
1207
+ async def extract_async(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=2048, document_key:str=None, temperature:float=0.0,
1208
+ concurrent_batch_size:int=32, return_messages_log:bool=False, **kwrs) -> List[FrameExtractionUnitResult]:
1177
1209
  """
1178
- The asynchronous version of the extract() method.
1210
+ This is the asynchronous version of the extract() method with the review step.
1179
1211
 
1180
1212
  Parameters:
1181
1213
  ----------
1182
1214
  text_content : Union[str, Dict[str,str]]
1183
- the input text content to put in prompt template.
1215
+ the input text content to put in prompt template.
1184
1216
  If str, the prompt template must have only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
1185
1217
  If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
1186
- max_new_tokens : str, Optional
1187
- the max number of new tokens LLM should generate.
1218
+ max_new_tokens : int, Optional
1219
+ the max number of new tokens LLM should generate.
1188
1220
  document_key : str, Optional
1189
- specify the key in text_content where document text is.
1221
+ specify the key in text_content where document text is.
1190
1222
  If text_content is str, this parameter will be ignored.
1191
1223
  temperature : float, Optional
1192
1224
  the temperature for token sampling.
1193
1225
  concurrent_batch_size : int, Optional
1194
- the number of sentences to process in concurrent.
1226
+ the batch size for concurrent processing.
1195
1227
  return_messages_log : bool, Optional
1196
- if True, a list of messages will be returned.
1228
+ if True, a list of messages will be returned, including review steps.
1197
1229
 
1198
- Return : str
1199
- the output from LLM. Need post-processing.
1230
+ Return : List[FrameExtractionUnitResult]
1231
+ the output from LLM for each unit after review. Contains the start, end, text, and generated text.
1200
1232
  """
1201
- # Check if self.inference_engine.chat_async() is implemented
1202
- if not hasattr(self.inference_engine, 'chat_async'):
1203
- raise NotImplementedError(f"{self.inference_engine.__class__.__name__} does not have chat_async() method.")
1204
-
1205
- # define output
1206
- output = []
1207
- # sentence tokenization
1208
1233
  if isinstance(text_content, str):
1209
- sentences = self._get_sentences(text_content)
1234
+ doc_text = text_content
1210
1235
  elif isinstance(text_content, dict):
1211
1236
  if document_key is None:
1212
1237
  raise ValueError("document_key must be provided when text_content is dict.")
1213
- sentences = self._get_sentences(text_content[document_key])
1238
+ if document_key not in text_content:
1239
+ raise ValueError(f"document_key '{document_key}' not found in text_content dictionary.")
1240
+ doc_text = text_content[document_key]
1241
+ else:
1242
+ raise TypeError("text_content must be a string or a dictionary.")
1214
1243
 
1215
- if return_messages_log:
1216
- messages_log = []
1244
+ units = self.unit_chunker.chunk(doc_text)
1217
1245
 
1218
- # generate initial outputs sentence by sentence
1219
- for i in range(0, len(sentences), concurrent_batch_size):
1220
- messages_list = []
1221
- init_tasks = []
1222
- review_tasks = []
1223
- batch = sentences[i:i + concurrent_batch_size]
1224
- for j, sent in enumerate(batch):
1225
- # construct chat messages
1226
- messages = []
1227
- if self.system_prompt:
1228
- messages.append({'role': 'system', 'content': self.system_prompt})
1246
+ # context chunker init
1247
+ self.context_chunker.fit(doc_text, units)
1229
1248
 
1230
- context = self._get_context_sentences(text_content, i + j, sentences, document_key)
1231
-
1232
- if self.context_sentences == 0:
1233
- # no context, just place sentence of interest
1234
- if isinstance(text_content, str):
1235
- messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
1236
- else:
1237
- sentence_content = text_content.copy()
1238
- sentence_content[document_key] = sent['sentence_text']
1239
- messages.append({'role': 'user', 'content': self._get_user_prompt(sentence_content)})
1249
+ # <--- Initial generation step --->
1250
+ initial_tasks_input = []
1251
+ for i, unit in enumerate(units):
1252
+ # construct chat messages for initial generation
1253
+ messages = []
1254
+ if self.system_prompt:
1255
+ messages.append({'role': 'system', 'content': self.system_prompt})
1256
+
1257
+ context = self.context_chunker.chunk(unit)
1258
+
1259
+ if context == "":
1260
+ # no context, just place unit in user prompt
1261
+ if isinstance(text_content, str):
1262
+ messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
1240
1263
  else:
1241
- # insert context
1242
- if isinstance(text_content, str):
1243
- messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
1244
- else:
1245
- context_content = text_content.copy()
1246
- context_content[document_key] = context
1247
- messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
1248
- # simulate conversation
1249
- messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
1250
- # place sentence of interest
1251
- messages.append({'role': 'user', 'content': sent['sentence_text']})
1252
-
1253
- messages_list.append(messages)
1254
-
1255
- task = asyncio.create_task(
1256
- self.inference_engine.chat_async(
1257
- messages=messages,
1258
- max_new_tokens=max_new_tokens,
1259
- temperature=temperature,
1260
- **kwrs
1261
- )
1264
+ unit_content = text_content.copy()
1265
+ unit_content[document_key] = unit.text
1266
+ messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
1267
+ else:
1268
+ # insert context to user prompt
1269
+ if isinstance(text_content, str):
1270
+ messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
1271
+ else:
1272
+ context_content = text_content.copy()
1273
+ context_content[document_key] = context
1274
+ messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
1275
+ # simulate conversation where assistant confirms
1276
+ messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
1277
+ # place unit of interest
1278
+ messages.append({'role': 'user', 'content': unit.text})
1279
+
1280
+ # Store unit and messages together for the initial task
1281
+ initial_tasks_input.append({"unit": unit, "messages": messages, "original_index": i})
1282
+
1283
+ semaphore = asyncio.Semaphore(concurrent_batch_size)
1284
+
1285
+ async def initial_semaphore_helper(task_data: Dict, max_new_tokens: int, temperature: float, **kwrs):
1286
+ unit = task_data["unit"]
1287
+ messages = task_data["messages"]
1288
+ original_index = task_data["original_index"]
1289
+
1290
+ async with semaphore:
1291
+ gen_text = await self.inference_engine.chat_async(
1292
+ messages=messages,
1293
+ max_new_tokens=max_new_tokens,
1294
+ temperature=temperature,
1295
+ **kwrs
1262
1296
  )
1263
- init_tasks.append(task)
1264
-
1265
- # Wait until the batch is done, collect results and move on to next batch
1266
- init_responses = await asyncio.gather(*init_tasks)
1267
- # Collect initials
1268
- initials = []
1269
- for gen_text, sent, messages in zip(init_responses, batch, messages_list):
1270
- initials.append({'sentence_start': sent['start'],
1271
- 'sentence_end': sent['end'],
1272
- 'sentence_text': sent['sentence_text'],
1273
- 'gen_text': gen_text,
1274
- 'messages': messages})
1275
-
1276
- # Review
1277
- for init in initials:
1278
- messages = init["messages"]
1279
- initial = init["gen_text"]
1280
- messages.append({'role': 'assistant', 'content': initial})
1281
- messages.append({'role': 'user', 'content': self.review_prompt})
1282
- task = asyncio.create_task(
1283
- self.inference_engine.chat_async(
1284
- messages=messages,
1285
- max_new_tokens=max_new_tokens,
1286
- temperature=temperature,
1287
- **kwrs
1288
- )
1289
- )
1290
- review_tasks.append(task)
1291
-
1292
- review_responses = await asyncio.gather(*review_tasks)
1293
-
1294
- # Collect reviews
1295
- reviews = []
1296
- for gen_text, sent in zip(review_responses, batch):
1297
- reviews.append({'sentence_start': sent['start'],
1298
- 'sentence_end': sent['end'],
1299
- 'sentence_text': sent['sentence_text'],
1300
- 'gen_text': gen_text})
1301
-
1302
- for init, rev in zip(initials, reviews):
1303
- if self.review_mode == "revision":
1304
- gen_text = rev['gen_text']
1305
- elif self.review_mode == "addition":
1306
- gen_text = init['gen_text'] + '\n' + rev['gen_text']
1297
+ # Return initial generation result along with the messages used and the unit
1298
+ return {"original_index": original_index, "unit": unit, "initial_gen_text": gen_text, "initial_messages": messages}
1299
+
1300
+ # Create and gather initial generation tasks
1301
+ initial_tasks = [
1302
+ asyncio.create_task(initial_semaphore_helper(
1303
+ task_inp,
1304
+ max_new_tokens=max_new_tokens,
1305
+ temperature=temperature,
1306
+ **kwrs
1307
+ ))
1308
+ for task_inp in initial_tasks_input
1309
+ ]
1310
+
1311
+ initial_results_raw = await asyncio.gather(*initial_tasks)
1312
+
1313
+ # Sort initial results back into original order
1314
+ initial_results_raw.sort(key=lambda x: x["original_index"])
1315
+
1316
+ # <--- Review step --->
1317
+ review_tasks_input = []
1318
+ for result_data in initial_results_raw:
1319
+ # Prepare messages for the review step
1320
+ initial_messages = result_data["initial_messages"]
1321
+ initial_gen_text = result_data["initial_gen_text"]
1322
+ review_messages = initial_messages + [
1323
+ {'role': 'assistant', 'content': initial_gen_text},
1324
+ {'role': 'user', 'content': self.review_prompt}
1325
+ ]
1326
+ # Store data needed for review task
1327
+ review_tasks_input.append({
1328
+ "unit": result_data["unit"],
1329
+ "initial_gen_text": initial_gen_text,
1330
+ "messages": review_messages,
1331
+ "original_index": result_data["original_index"],
1332
+ "full_initial_log": initial_messages + [{'role': 'assistant', 'content': initial_gen_text}] if return_messages_log else None # Log up to initial generation
1333
+ })
1334
+
1335
+
1336
+ async def review_semaphore_helper(task_data: Dict, max_new_tokens: int, temperature: float, **kwrs):
1337
+ messages = task_data["messages"]
1338
+ original_index = task_data["original_index"]
1339
+
1340
+ async with semaphore:
1341
+ review_gen_text = await self.inference_engine.chat_async(
1342
+ messages=messages,
1343
+ max_new_tokens=max_new_tokens,
1344
+ temperature=temperature,
1345
+ **kwrs
1346
+ )
1347
+ # Combine initial and review results
1348
+ task_data["review_gen_text"] = review_gen_text
1349
+ if return_messages_log:
1350
+ # Log for the review call itself
1351
+ task_data["full_review_log"] = messages + [{'role': 'assistant', 'content': review_gen_text}]
1352
+ return task_data # Return the augmented dictionary
1353
+
1354
+ # Create and gather review tasks
1355
+ review_tasks = [
1356
+ asyncio.create_task(review_semaphore_helper(
1357
+ task_inp,
1358
+ max_new_tokens=max_new_tokens,
1359
+ temperature=temperature,
1360
+ **kwrs
1361
+ ))
1362
+ for task_inp in review_tasks_input
1363
+ ]
1364
+
1365
+ final_results_raw = await asyncio.gather(*review_tasks)
1366
+
1367
+ # Sort final results back into original order (asyncio.gather preserves task order, but sort defensively)
1368
+ final_results_raw.sort(key=lambda x: x["original_index"])
1369
+
1370
+ # <--- Process final results --->
1371
+ output: List[FrameExtractionUnitResult] = []
1372
+ messages_log: Optional[List[List[Dict[str, str]]]] = [] if return_messages_log else None
1373
+
1374
+ for result_data in final_results_raw:
1375
+ unit = result_data["unit"]
1376
+ initial_gen = result_data["initial_gen_text"]
1377
+ review_gen = result_data["review_gen_text"]
1378
+
1379
+ # Combine based on review mode
1380
+ if self.review_mode == "revision":
1381
+ final_gen_text = review_gen
1382
+ elif self.review_mode == "addition":
1383
+ final_gen_text = initial_gen + '\n' + review_gen
1384
+ else: # Should not happen due to init check
1385
+ final_gen_text = review_gen # Default to revision if mode is somehow invalid
1386
+
1387
+ # Create final result object
1388
+ result = FrameExtractionUnitResult(
1389
+ start=unit.start,
1390
+ end=unit.end,
1391
+ text=unit.text,
1392
+ gen_text=final_gen_text # Use the combined/reviewed text
1393
+ )
1394
+ output.append(result)
1395
+
1396
+ # Append full conversation log if requested
1397
+ if return_messages_log:
1398
+ full_log_for_unit = result_data.get("full_initial_log", []) + [{'role': 'user', 'content': self.review_prompt}] + [{'role': 'assistant', 'content': review_gen}]
1399
+ messages_log.append(full_log_for_unit)
1307
1400
 
1308
- if return_messages_log:
1309
- messages = init["messages"]
1310
- messages.append({"role": "assistant", "content": rev['gen_text']})
1311
- messages_log.append(messages)
1401
+ if return_messages_log:
1402
+ return output, messages_log
1403
+ else:
1404
+ return output
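
An editorial sketch (not part of the diff) of driving extract_async() from synchronous code; it mirrors the nest_asyncio + asyncio.run pattern the removed synchronous wrapper above used. The extractor and note variables continue the earlier sketch; the parameter values are illustrative.

import asyncio
import nest_asyncio

nest_asyncio.apply()   # needed inside Jupyter; a plain script can call asyncio.run() directly
results = asyncio.run(
    extractor.extract_async(text_content=note,
                            max_new_tokens=2048,
                            temperature=0.0,
                            concurrent_batch_size=8)   # at most 8 units awaited concurrently via the semaphore
)
for r in results:
    print(r.start, r.end, r.gen_text[:80])             # FrameExtractionUnitResult fields per the docstring
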
1405
+
1406
+
1407
+ class BasicFrameExtractor(DirectFrameExtractor):
1408
+ def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
1409
+ """
1410
+ This class directly prompts the LLM for frame extraction.
1411
+ Input a system prompt (optional), a prompt template (with instructions and few-shot examples),
1412
+ and specify an LLM.
1312
1413
 
1313
- # add to output
1314
- output.append({'sentence_start': init['sentence_start'],
1315
- 'sentence_end': init['sentence_end'],
1316
- 'sentence_text': init['sentence_text'],
1317
- 'gen_text': gen_text})
1414
+ Parameters:
1415
+ ----------
1416
+ inference_engine : InferenceEngine
1417
+ the LLM inference engine object. Must implement the chat() method.
1418
+ prompt_template : str
1419
+ prompt template with "{{<placeholder name>}}" placeholder.
1420
+ system_prompt : str, Optional
1421
+ system prompt.
1422
+ """
1423
+ super().__init__(inference_engine=inference_engine,
1424
+ unit_chunker=WholeDocumentUnitChunker(),
1425
+ prompt_template=prompt_template,
1426
+ system_prompt=system_prompt,
1427
+ context_chunker=NoContextChunker(),
1428
+ **kwrs)
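
Editorial sketch (not part of the diff): BasicFrameExtractor is DirectFrameExtractor configured with a WholeDocumentUnitChunker and NoContextChunker, i.e., one prompt over the full text. The extract_frames() call and entity_key follow the frame post-processing shown in the removed code above; the engine and prompt are carried over from the earlier sketch and are illustrative.

basic = BasicFrameExtractor(inference_engine=engine,
                            prompt_template="List medications as JSON: {{input}}")
frames = basic.extract_frames(text_content=note, entity_key="entity_text")
for frame in frames:
    print(frame.entity_text, frame.attr)   # LLMInformationExtractionFrame fields
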
1318
1429
 
1319
- if return_messages_log:
1320
- return output, messages_log
1321
- return output
1430
+ class BasicReviewFrameExtractor(ReviewFrameExtractor):
1431
+ def __init__(self, inference_engine:InferenceEngine, prompt_template:str, review_mode:str, review_prompt:str=None, system_prompt:str=None, **kwrs):
1432
+ """
1433
+ This class adds a review step after the BasicFrameExtractor.
1434
+ The review process asks the LLM to review its output and:
1435
+ 1. add more frames while keeping the current ones. This is efficient for boosting recall.
1436
+ 2. or, regenerate frames (add new and delete existing).
1437
+ Use the review_mode parameter to specify. Note that the review_prompt should instruct the LLM accordingly.
1438
+
1439
+ Parameters:
1440
+ ----------
1441
+ inference_engine : InferenceEngine
1442
+ the LLM inference engine object. Must implement the chat() method.
1443
+ prompt_template : str
1444
+ prompt template with "{{<placeholder name>}}" placeholder.
1445
+ review_prompt : str, Optional
1446
+ the prompt text that asks the LLM to review its output. Specify addition or revision in the instruction.
1447
+ if not provided, a default review prompt will be used.
1448
+ review_mode : str
1449
+ review mode. Must be one of {"addition", "revision"}
1450
+ addition mode only asks the LLM to add new frames, while revision mode asks the LLM to regenerate.
1451
+ system_prompt : str, Optional
1452
+ system prompt.
1453
+ """
1454
+ super().__init__(inference_engine=inference_engine,
1455
+ unit_chunker=WholeDocumentUnitChunker(),
1456
+ prompt_template=prompt_template,
1457
+ review_mode=review_mode,
1458
+ review_prompt=review_prompt,
1459
+ system_prompt=system_prompt,
1460
+ context_chunker=NoContextChunker(),
1461
+ **kwrs)
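
Editorial sketch (not part of the diff): the review_mode chosen here decides whether the review output is appended to ("addition") or replaces ("revision") the initial generation, per the combination logic in the review extractor above. Engine, note, and prompt are the illustrative values from the earlier sketches.

reviewer = BasicReviewFrameExtractor(inference_engine=engine,
                                     prompt_template="List medications as JSON: {{input}}",
                                     review_mode="addition")   # keep initial frames and only ask for additions (recall-oriented)
# review_mode="revision" would instead take the reviewed output as the final generation.
frames = reviewer.extract_frames(text_content=note, entity_key="entity_text")
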
1322
1462
 
1323
1463
 
1324
- class SentenceCoTFrameExtractor(SentenceFrameExtractor):
1325
- from nltk.tokenize.punkt import PunktSentenceTokenizer
1326
- def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None,
1464
+ class SentenceFrameExtractor(DirectFrameExtractor):
1465
+ def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None,
1327
1466
  context_sentences:Union[str, int]="all", **kwrs):
1328
1467
  """
1329
- This class performs sentence-based Chain-of-thoughts (CoT) information extraction.
1330
- A simulated chat follows this process:
1468
+ This class performs sentence-by-sentence information extraction.
1469
+ The process is as follows:
1331
1470
  1. system prompt (optional)
1332
- 2. user instructions (schema, background, full text, few-shot example...)
1333
- 3. user input first sentence
1334
- 4. assistant analyze the sentence
1335
- 5. assistant extract outputs
1336
- 6. repeat #3, #4, #5
1471
+ 2. user prompt with instructions (schema, background, full text, few-shot example...)
1472
+ 3. feed a sentence (starting with the first sentence)
1473
+ 4. the LLM extracts entities and attributes from the sentence
1474
+ 5. iterate to the next sentence and repeat steps 3-4 until all sentences are processed.
1337
1475
 
1338
1476
  Input a system prompt (optional), a prompt template (with user instructions),
1339
1477
  and specify an LLM.
1340
1478
 
1341
- Parameters
1479
+ Parameters:
1342
1480
  ----------
1343
1481
  inference_engine : InferenceEngine
1344
1482
  the LLM inference engine object. Must implement the chat() method.
@@ -1354,104 +1492,77 @@ class SentenceCoTFrameExtractor(SentenceFrameExtractor):
1354
1492
  if > 0, the number of sentences before and after the given sentence to provide as context.
1355
1493
  This is good for tasks that require context beyond the given sentence.
1356
1494
  """
1357
- super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
1358
- system_prompt=system_prompt, context_sentences=context_sentences, **kwrs)
1495
+ if not isinstance(context_sentences, int) and context_sentences != "all":
1496
+ raise ValueError('context_sentences must be an integer (>= 0) or "all".')
1497
+
1498
+ if isinstance(context_sentences, int) and context_sentences < 0:
1499
+ raise ValueError("context_sentences must be a positive integer.")
1500
+
1501
+ if isinstance(context_sentences, int):
1502
+ context_chunker = SlideWindowContextChunker(window_size=context_sentences)
1503
+ elif context_sentences == "all":
1504
+ context_chunker = WholeDocumentContextChunker()
1505
+
1506
+ super().__init__(inference_engine=inference_engine,
1507
+ unit_chunker=SentenceUnitChunker(),
1508
+ prompt_template=prompt_template,
1509
+ system_prompt=system_prompt,
1510
+ context_chunker=context_chunker,
1511
+ **kwrs)
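
Editorial sketch (not part of the diff) of how context_sentences maps onto the new chunkers in this __init__; the constructor values are illustrative and reuse the engine from the earlier sketch.

# context_sentences="all" -> WholeDocumentContextChunker()             (full text as context)
# context_sentences=0     -> SlideWindowContextChunker(window_size=0)  (no extra context)
# context_sentences=2     -> SlideWindowContextChunker(window_size=2)  (2 sentences on each side)
sent_extractor = SentenceFrameExtractor(inference_engine=engine,
                                        prompt_template="Extract diagnoses as JSON: {{input}}",
                                        context_sentences=2)
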
1359
1512
 
1360
1513
 
1361
- def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
1362
- document_key:str=None, temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict[str,str]]:
1514
+ class SentenceReviewFrameExtractor(ReviewFrameExtractor):
1515
+ def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
1516
+ review_mode:str, review_prompt:str=None, system_prompt:str=None,
1517
+ context_sentences:Union[str, int]="all", **kwrs):
1363
1518
  """
1364
- This method inputs a text and outputs a list of outputs per sentence.
1519
+ This class adds a review step after the SentenceFrameExtractor.
1520
+ For each sentence, the review process asks the LLM to review its output and:
1521
+ 1. add more frames while keeping the current ones. This is efficient for boosting recall.
1522
+ 2. or, regenerate frames (add new and delete existing).
1523
+ Use the review_mode parameter to specify. Note that the review_prompt should instruct the LLM accordingly.
1365
1524
 
1366
1525
  Parameters:
1367
1526
  ----------
1368
- text_content : Union[str, Dict[str,str]]
1369
- the input text content to put in prompt template.
1370
- If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
1371
- If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
1372
- max_new_tokens : str, Optional
1373
- the max number of new tokens LLM should generate.
1374
- document_key : str, Optional
1375
- specify the key in text_content where document text is.
1376
- If text_content is str, this parameter will be ignored.
1377
- temperature : float, Optional
1378
- the temperature for token sampling.
1379
- stream : bool, Optional
1380
- if True, LLM generated text will be printed in terminal in real-time.
1381
- return_messages_log : bool, Optional
1382
- if True, a list of messages will be returned.
1383
-
1384
- Return : str
1385
- the output from LLM. Need post-processing.
1527
+ inference_engine : InferenceEngine
1528
+ the LLM inference engine object. Must implement the chat() method.
1529
+ prompt_template : str
1530
+ prompt template with "{{<placeholder name>}}" placeholder.
1531
+ review_prompt : str, Optional
1532
+ the prompt text that asks the LLM to review its output. Specify addition or revision in the instruction.
1533
+ if not provided, a default review prompt will be used.
1534
+ review_mode : str
1535
+ review mode. Must be one of {"addition", "revision"}
1536
+ addition mode only asks the LLM to add new frames, while revision mode asks the LLM to regenerate.
1537
+ system_prompt : str, Optional
1538
+ system prompt.
1539
+ context_sentences : Union[str, int], Optional
1540
+ number of sentences before and after the given sentence to provide additional context.
1541
+ if "all", the full text will be provided in the prompt as context.
1542
+ if 0, no additional context will be provided.
1543
+ This is good for tasks that do not require context beyond the given sentence.
1544
+ if > 0, the number of sentences before and after the given sentence to provide as context.
1545
+ This is good for tasks that require context beyond the given sentence.
1386
1546
  """
1387
- # define output
1388
- output = []
1389
- # sentence tokenization
1390
- if isinstance(text_content, str):
1391
- sentences = self._get_sentences(text_content)
1392
- elif isinstance(text_content, dict):
1393
- sentences = self._get_sentences(text_content[document_key])
1394
-
1395
- if return_messages_log:
1396
- messages_log = []
1397
-
1398
- # generate sentence by sentence
1399
- for i, sent in enumerate(sentences):
1400
- # construct chat messages
1401
- messages = []
1402
- if self.system_prompt:
1403
- messages.append({'role': 'system', 'content': self.system_prompt})
1404
-
1405
- context = self._get_context_sentences(text_content, i, sentences, document_key)
1406
-
1407
- if self.context_sentences == 0:
1408
- # no context, just place sentence of interest
1409
- if isinstance(text_content, str):
1410
- messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
1411
- else:
1412
- sentence_content = text_content.copy()
1413
- sentence_content[document_key] = sent['sentence_text']
1414
- messages.append({'role': 'user', 'content': self._get_user_prompt(sentence_content)})
1415
- else:
1416
- # insert context
1417
- if isinstance(text_content, str):
1418
- messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
1419
- else:
1420
- context_content = text_content.copy()
1421
- context_content[document_key] = context
1422
- messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
1423
- # simulate conversation
1424
- messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
1425
- # place sentence of interest
1426
- messages.append({'role': 'user', 'content': sent['sentence_text']})
1427
-
1428
- if stream:
1429
- print(f"\n\n{Fore.GREEN}Sentence: {Style.RESET_ALL}\n{sent['sentence_text']}\n")
1430
- if isinstance(self.context_sentences, int) and self.context_sentences > 0:
1431
- print(f"{Fore.YELLOW}Context:{Style.RESET_ALL}\n{context}\n")
1432
- print(f"{Fore.BLUE}CoT:{Style.RESET_ALL}")
1433
-
1434
- gen_text = self.inference_engine.chat(
1435
- messages=messages,
1436
- max_new_tokens=max_new_tokens,
1437
- temperature=temperature,
1438
- stream=stream,
1439
- **kwrs
1440
- )
1441
-
1442
- if return_messages_log:
1443
- messages.append({"role": "assistant", "content": gen_text})
1444
- messages_log.append(messages)
1445
-
1446
- # add to output
1447
- output.append({'sentence_start': sent['start'],
1448
- 'sentence_end': sent['end'],
1449
- 'sentence_text': sent['sentence_text'],
1450
- 'gen_text': gen_text})
1547
+ if not isinstance(context_sentences, int) and context_sentences != "all":
1548
+ raise ValueError('context_sentences must be an integer (>= 0) or "all".')
1451
1549
 
1452
- if return_messages_log:
1453
- return output, messages_log
1454
- return output
1550
+ if isinstance(context_sentences, int) and context_sentences < 0:
1551
+ raise ValueError("context_sentences must be a positive integer.")
1552
+
1553
+ if isinstance(context_sentences, int):
1554
+ context_chunker = SlideWindowContextChunker(window_size=context_sentences)
1555
+ elif context_sentences == "all":
1556
+ context_chunker = WholeDocumentContextChunker()
1557
+
1558
+ super().__init__(inference_engine=inference_engine,
1559
+ unit_chunker=SentenceUnitChunker(),
1560
+ prompt_template=prompt_template,
1561
+ review_mode=review_mode,
1562
+ review_prompt=review_prompt,
1563
+ system_prompt=system_prompt,
1564
+ context_chunker=context_chunker,
1565
+ **kwrs)
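
Editorial sketch (not part of the diff): sentence-level extraction with a per-sentence review pass and a one-sentence context window on each side. Engine, note, prompt, and parameter values are illustrative, continuing the earlier sketches.

sent_reviewer = SentenceReviewFrameExtractor(inference_engine=engine,
                                             prompt_template="Extract diagnoses as JSON: {{input}}",
                                             review_mode="revision",   # regenerate frames per sentence during review
                                             context_sentences=1)
frames = sent_reviewer.extract_frames(text_content=note, entity_key="entity_text")
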
1455
1566
 
1456
1567
 
1457
1568
  class RelationExtractor(Extractor):