llm-ie 0.4.7__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_ie/__init__.py +4 -2
- llm_ie/asset/default_prompts/BasicReviewFrameExtractor_addition_review_prompt.txt +3 -0
- llm_ie/asset/default_prompts/BasicReviewFrameExtractor_revision_review_prompt.txt +2 -0
- llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt +2 -1
- llm_ie/asset/default_prompts/ReviewFrameExtractor_revision_review_prompt.txt +2 -1
- llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt +104 -86
- llm_ie/asset/prompt_guide/BasicReviewFrameExtractor_prompt_guide.txt +163 -0
- llm_ie/asset/prompt_guide/DirectFrameExtractor_prompt_guide.txt +163 -0
- llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt +103 -85
- llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt +103 -86
- llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt +103 -86
- llm_ie/chunkers.py +191 -0
- llm_ie/data_types.py +75 -1
- llm_ie/engines.py +274 -183
- llm_ie/extractors.py +961 -850
- llm_ie/prompt_editor.py +39 -6
- llm_ie-1.0.0.dist-info/METADATA +18 -0
- llm_ie-1.0.0.dist-info/RECORD +27 -0
- llm_ie/asset/prompt_guide/SentenceCoTFrameExtractor_prompt_guide.txt +0 -217
- llm_ie-0.4.7.dist-info/METADATA +0 -1219
- llm_ie-0.4.7.dist-info/RECORD +0 -23
- {llm_ie-0.4.7.dist-info → llm_ie-1.0.0.dist-info}/WHEEL +0 -0
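The headline change in 1.0.0 is that the sentence-specific extractors are replaced by a single DirectFrameExtractor composed with a unit chunker and a context chunker (the new llm_ie/chunkers.py), and extract_frames() no longer takes an entity_key (the key is fixed to "entity_text"). Based only on the signatures visible in this diff, a rough usage sketch might look like the following; the chunker constructor arguments and the concrete inference engine are assumptions, not shown in this diff.

# Hypothetical sketch based on the signatures visible in this diff.
# SentenceUnitChunker()/SlideWindowContextChunker() default constructors and the
# concrete InferenceEngine subclass are assumptions; only the class names appear
# in the import changes below.
from llm_ie.chunkers import SentenceUnitChunker, SlideWindowContextChunker
from llm_ie.extractors import DirectFrameExtractor

prompt_template = "Extract diagnoses as JSON with an entity_text key.\n{{document}}"
engine = ...  # any llm_ie.engines.InferenceEngine implementing chat()/chat_async()

extractor = DirectFrameExtractor(
    inference_engine=engine,
    unit_chunker=SentenceUnitChunker(),           # one LLM call per sentence
    context_chunker=SlideWindowContextChunker(),  # assumed default constructor
    prompt_template=prompt_template,
)

# entity_key is gone; extract_frames() now keys on "entity_text" internally.
frames = extractor.extract_frames(text_content="Patient denies chest pain.", concurrent=True)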
llm_ie/extractors.py
CHANGED
@@ -8,10 +8,12 @@ import warnings
 import itertools
 import asyncio
 import nest_asyncio
-from typing import Set, List, Dict, Tuple, Union, Callable
-from llm_ie.data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
+from typing import Any, Set, List, Dict, Tuple, Union, Callable, Generator, Optional
+from llm_ie.data_types import FrameExtractionUnit, FrameExtractionUnitResult, LLMInformationExtractionFrame, LLMInformationExtractionDocument
+from llm_ie.chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker
+from llm_ie.chunkers import ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
 from llm_ie.engines import InferenceEngine
-from colorama import Fore, Style
+from colorama import Fore, Style
 
 
 class Extractor:
@@ -38,15 +40,46 @@ class Extractor:
     def get_prompt_guide(cls) -> str:
         """
         This method returns the pre-defined prompt guideline for the extractor from the package asset.
+        It searches for a guide specific to the current class first, if not found, it will search
+        for the guide in its ancestors by traversing the class's method resolution order (MRO).
         """
-
-
-
-
-
-
-
-
+        original_class_name = cls.__name__
+
+        for current_class_in_mro in cls.__mro__:
+            if current_class_in_mro is object:
+                continue
+
+            current_class_name = current_class_in_mro.__name__
+
+            try:
+                file_path_obj = importlib.resources.files('llm_ie.asset.prompt_guide').joinpath(f"{current_class_name}_prompt_guide.txt")
+
+                with open(file_path_obj, 'r', encoding="utf-8") as f:
+                    prompt_content = f.read()
+                    # If the guide was found for an ancestor, not the original class, issue a warning.
+                    if cls is not current_class_in_mro:
+                        warnings.warn(
+                            f"Prompt guide for '{original_class_name}' not found. "
+                            f"Using guide from ancestor: '{current_class_name}_prompt_guide.txt'.",
+                            UserWarning
+                        )
+                    return prompt_content
+            except FileNotFoundError:
+                pass
+
+            except Exception as e:
+                warnings.warn(
+                    f"Error attempting to read prompt guide for '{current_class_name}' "
+                    f"from '{str(file_path_obj)}': {e}. Trying next in MRO.",
+                    UserWarning
+                )
+                continue
+
+        # If the loop completes, no prompt guide was found for the original class or any of its ancestors.
+        raise FileNotFoundError(
+            f"Prompt guide for '{original_class_name}' not found in the package asset. "
+            f"Is it a custom extractor?"
+        )
 
     def _get_user_prompt(self, text_content:Union[str, Dict[str,str]]) -> str:
         """
@@ -138,7 +171,8 @@ class Extractor:
 
 class FrameExtractor(Extractor):
     from nltk.tokenize import RegexpTokenizer
-    def __init__(self, inference_engine:InferenceEngine,
+    def __init__(self, inference_engine:InferenceEngine, unit_chunker:UnitChunker,
+                 prompt_template:str, system_prompt:str=None, context_chunker:ContextChunker=None, **kwrs):
         """
         This is the abstract class for frame extraction.
         Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).
@@ -147,15 +181,25 @@ class FrameExtractor(Extractor):
         ----------
         inference_engine : InferenceEngine
             the LLM inferencing engine object. Must implements the chat() method.
+        unit_chunker : UnitChunker
+            the unit chunker object that determines how to chunk the document text into units.
         prompt_template : str
             prompt template with "{{<placeholder name>}}" placeholder.
         system_prompt : str, Optional
            system prompt.
+        context_chunker : ContextChunker
+            the context chunker object that determines how to get context for each unit.
         """
         super().__init__(inference_engine=inference_engine,
                          prompt_template=prompt_template,
                          system_prompt=system_prompt,
                          **kwrs)
+
+        self.unit_chunker = unit_chunker
+        if context_chunker is None:
+            self.context_chunker = NoContextChunker()
+        else:
+            self.context_chunker = context_chunker
 
         self.tokenizer = self.RegexpTokenizer(r'\w+|[^\w\s]')
 
@@ -338,32 +382,38 @@ class FrameExtractor(Extractor):
         return NotImplemented
 
 
-class
-    def __init__(self, inference_engine:InferenceEngine,
+class DirectFrameExtractor(FrameExtractor):
+    def __init__(self, inference_engine:InferenceEngine, unit_chunker:UnitChunker,
+                 prompt_template:str, system_prompt:str=None, context_chunker:ContextChunker=None, **kwrs):
         """
-        This class
-        Input system prompt (optional), prompt template (with instruction, few-shot examples)
-        and specify a LLM.
+        This class is for general unit-context frame extraction.
+        Input LLM inference engine, system prompt (optional), prompt template (with instruction, few-shot examples).
 
         Parameters:
         ----------
         inference_engine : InferenceEngine
             the LLM inferencing engine object. Must implements the chat() method.
+        unit_chunker : UnitChunker
+            the unit chunker object that determines how to chunk the document text into units.
         prompt_template : str
             prompt template with "{{<placeholder name>}}" placeholder.
         system_prompt : str, Optional
             system prompt.
+        context_chunker : ContextChunker
+            the context chunker object that determines how to get context for each unit.
         """
-        super().__init__(inference_engine=inference_engine,
-
-
+        super().__init__(inference_engine=inference_engine,
+                         unit_chunker=unit_chunker,
+                         prompt_template=prompt_template,
+                         system_prompt=system_prompt,
+                         context_chunker=context_chunker,
                          **kwrs)
-
+
 
     def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=2048,
-                temperature:float=0.0,
+                document_key:str=None, temperature:float=0.0, verbose:bool=False, return_messages_log:bool=False, **kwrs) -> List[FrameExtractionUnitResult]:
         """
-        This method inputs a text and outputs a
+        This method inputs a text and outputs a list of outputs per unit.
 
         Parameters:
         ----------
@@ -371,44 +421,349 @@ class BasicFrameExtractor(FrameExtractor):
             the input text content to put in prompt template.
             If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
             If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
-        max_new_tokens :
-            the max number of new tokens LLM
+        max_new_tokens : int, Optional
+            the max number of new tokens LLM should generate.
+        document_key : str, Optional
+            specify the key in text_content where document text is.
+            If text_content is str, this parameter will be ignored.
         temperature : float, Optional
-            the temperature for token sampling.
-
+            the temperature for token sampling.
+        verbose : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
         return_messages_log : bool, Optional
             if True, a list of messages will be returned.
 
-        Return :
-            the output from LLM.
+        Return : List[FrameExtractionUnitResult]
+            the output from LLM for each unit. Contains the start, end, text, and generated text.
+        """
+        # define output
+        output = []
+        # unit chunking
+        if isinstance(text_content, str):
+            doc_text = text_content
+
+        elif isinstance(text_content, dict):
+            if document_key is None:
+                raise ValueError("document_key must be provided when text_content is dict.")
+            doc_text = text_content[document_key]
+
+        units = self.unit_chunker.chunk(doc_text)
+        # context chunker init
+        self.context_chunker.fit(doc_text, units)
+        # messages log
+        if return_messages_log:
+            messages_log = []
+
+        # generate unit by unit
+        for i, unit in enumerate(units):
+            # construct chat messages
+            messages = []
+            if self.system_prompt:
+                messages.append({'role': 'system', 'content': self.system_prompt})
+
+            context = self.context_chunker.chunk(unit)
+
+            if context == "":
+                # no context, just place unit in user prompt
+                if isinstance(text_content, str):
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
+                else:
+                    unit_content = text_content.copy()
+                    unit_content[document_key] = unit.text
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
+            else:
+                # insert context to user prompt
+                if isinstance(text_content, str):
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                else:
+                    context_content = text_content.copy()
+                    context_content[document_key] = context
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
+                # simulate conversation where assistant confirms
+                messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
+                # place unit of interest
+                messages.append({'role': 'user', 'content': unit.text})
+
+            if verbose:
+                print(f"\n\n{Fore.GREEN}Unit {i}:{Style.RESET_ALL}\n{unit.text}\n")
+                if context != "":
+                    print(f"{Fore.YELLOW}Context:{Style.RESET_ALL}\n{context}\n")
+
+                print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
+
+                response_stream = self.inference_engine.chat(
+                    messages=messages,
+                    max_new_tokens=max_new_tokens,
+                    temperature=temperature,
+                    stream=True,
+                    **kwrs
+                )
+
+                gen_text = ""
+                for chunk in response_stream:
+                    gen_text += chunk
+                    print(chunk, end='', flush=True)
+
+            else:
+                gen_text = self.inference_engine.chat(
+                    messages=messages,
+                    max_new_tokens=max_new_tokens,
+                    temperature=temperature,
+                    stream=False,
+                    **kwrs
+                )
+
+            if return_messages_log:
+                messages.append({"role": "assistant", "content": gen_text})
+                messages_log.append(messages)
+
+            # add to output
+            result = FrameExtractionUnitResult(
+                start=unit.start,
+                end=unit.end,
+                text=unit.text,
+                gen_text=gen_text)
+            output.append(result)
+
+        if return_messages_log:
+            return output, messages_log
+
+        return output
+
+    def stream(self, text_content: Union[str, Dict[str, str]], max_new_tokens: int = 2048, document_key: str = None,
+               temperature: float = 0.0, **kwrs) -> Generator[Dict[str, Any], None, List[FrameExtractionUnitResult]]:
+        """
+        Streams LLM responses per unit with structured event types,
+        and returns collected data for post-processing.
+
+        Yields:
+        -------
+        Dict[str, Any]: (type, data)
+            - {"type": "info", "data": str_message}: General informational messages.
+            - {"type": "unit", "data": dict_unit_info}: Signals start of a new unit. dict_unit_info contains {'id', 'text', 'start', 'end'}
+            - {"type": "context", "data": str_context}: Context string for the current unit.
+            - {"type": "llm_chunk", "data": str_chunk}: A raw chunk from the LLM.
+
+        Returns:
+        --------
+        List[FrameExtractionUnitResult]:
+            A list of FrameExtractionUnitResult objects, each containing the
+            original unit details and the fully accumulated 'gen_text' from the LLM.
+        """
+        collected_results: List[FrameExtractionUnitResult] = []
+
+        if isinstance(text_content, str):
+            doc_text = text_content
+        elif isinstance(text_content, dict):
+            if document_key is None:
+                raise ValueError("document_key must be provided when text_content is dict.")
+            if document_key not in text_content:
+                raise ValueError(f"document_key '{document_key}' not found in text_content.")
+            doc_text = text_content[document_key]
+        else:
+            raise TypeError("text_content must be a string or a dictionary.")
+
+        units: List[FrameExtractionUnit] = self.unit_chunker.chunk(doc_text)
+        self.context_chunker.fit(doc_text, units)
+
+        yield {"type": "info", "data": f"Starting LLM processing for {len(units)} units."}
+
+        for i, unit in enumerate(units):
+            unit_info_payload = {"id": i, "text": unit.text, "start": unit.start, "end": unit.end}
+            yield {"type": "unit", "data": unit_info_payload}
+
+            messages = []
+            if self.system_prompt:
+                messages.append({'role': 'system', 'content': self.system_prompt})
+
+            context_str = self.context_chunker.chunk(unit)
+
+            # Construct prompt input based on whether text_content was str or dict
+            if context_str:
+                yield {"type": "context", "data": context_str}
+                prompt_input_for_context = context_str
+                if isinstance(text_content, dict):
+                    context_content_dict = text_content.copy()
+                    context_content_dict[document_key] = context_str
+                    prompt_input_for_context = context_content_dict
+                messages.append({'role': 'user', 'content': self._get_user_prompt(prompt_input_for_context)})
+                messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
+                messages.append({'role': 'user', 'content': unit.text})
+            else: # No context
+                prompt_input_for_unit = unit.text
+                if isinstance(text_content, dict):
+                    unit_content_dict = text_content.copy()
+                    unit_content_dict[document_key] = unit.text
+                    prompt_input_for_unit = unit_content_dict
+                messages.append({'role': 'user', 'content': self._get_user_prompt(prompt_input_for_unit)})
+
+            current_gen_text = ""
+
+            response_stream = self.inference_engine.chat(
+                messages=messages,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                stream=True,
+                **kwrs
+            )
+            for chunk in response_stream:
+                yield {"type": "llm_chunk", "data": chunk}
+                current_gen_text += chunk
+
+            # Store the result for this unit
+            result_for_unit = FrameExtractionUnitResult(
+                start=unit.start,
+                end=unit.end,
+                text=unit.text,
+                gen_text=current_gen_text
+            )
+            collected_results.append(result_for_unit)
+
+        yield {"type": "info", "data": "All units processed by LLM."}
+        return collected_results
+
+    async def extract_async(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=2048, document_key:str=None, temperature:float=0.0,
+                            concurrent_batch_size:int=32, return_messages_log:bool=False, **kwrs) -> List[FrameExtractionUnitResult]:
+        """
+        This is the asynchronous version of the extract() method.
+
+        Parameters:
+        ----------
+        text_content : Union[str, Dict[str,str]]
+            the input text content to put in prompt template.
+            If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
+            If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
+        max_new_tokens : int, Optional
+            the max number of new tokens LLM should generate.
+        document_key : str, Optional
+            specify the key in text_content where document text is.
+            If text_content is str, this parameter will be ignored.
+        temperature : float, Optional
+            the temperature for token sampling.
+        concurrent_batch_size : int, Optional
+            the batch size for concurrent processing.
+        return_messages_log : bool, Optional
+            if True, a list of messages will be returned.
+
+        Return : List[FrameExtractionUnitResult]
+            the output from LLM for each unit. Contains the start, end, text, and generated text.
         """
-
-
-
+        if isinstance(text_content, str):
+            doc_text = text_content
+        elif isinstance(text_content, dict):
+            if document_key is None:
+                raise ValueError("document_key must be provided when text_content is dict.")
+            if document_key not in text_content:
+                raise ValueError(f"document_key '{document_key}' not found in text_content dictionary.")
+            doc_text = text_content[document_key]
+        else:
+            raise TypeError("text_content must be a string or a dictionary.")
+
+        units = self.unit_chunker.chunk(doc_text)
+
+        # context chunker init
+        self.context_chunker.fit(doc_text, units)
+
+        # Prepare inputs for all units first
+        tasks_input = []
+        for i, unit in enumerate(units):
+            # construct chat messages
+            messages = []
+            if self.system_prompt:
+                messages.append({'role': 'system', 'content': self.system_prompt})
+
+            context = self.context_chunker.chunk(unit)
+
+            if context == "":
+                # no context, just place unit in user prompt
+                if isinstance(text_content, str):
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
+                else:
+                    unit_content = text_content.copy()
+                    unit_content[document_key] = unit.text
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
+            else:
+                # insert context to user prompt
+                if isinstance(text_content, str):
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                else:
+                    context_content = text_content.copy()
+                    context_content[document_key] = context
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
+                # simulate conversation where assistant confirms
+                messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
+                # place unit of interest
+                messages.append({'role': 'user', 'content': unit.text})
+
+            # Store unit and messages together for the task
+            tasks_input.append({"unit": unit, "messages": messages, "original_index": i})
 
-
-
+        # Process units concurrently with asyncio.Semaphore
+        semaphore = asyncio.Semaphore(concurrent_batch_size)
+
+        async def semaphore_helper(task_data: Dict, max_new_tokens: int, temperature: float, **kwrs):
+            unit = task_data["unit"]
+            messages = task_data["messages"]
+            original_index = task_data["original_index"]
+
+            async with semaphore:
+                gen_text = await self.inference_engine.chat_async(
                     messages=messages,
-                    max_new_tokens=max_new_tokens,
+                    max_new_tokens=max_new_tokens,
                     temperature=temperature,
-                    stream=stream,
                     **kwrs
                 )
-
+            return {"original_index": original_index, "unit": unit, "gen_text": gen_text, "messages": messages}
+
+        # Create and gather tasks
+        tasks = []
+        for task_inp in tasks_input:
+            task = asyncio.create_task(semaphore_helper(
+                task_inp,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                **kwrs
+            ))
+            tasks.append(task)
+
+        results_raw = await asyncio.gather(*tasks)
+
+        # Sort results back into original order using the index stored
+        results_raw.sort(key=lambda x: x["original_index"])
+
+        # Restructure the results
+        output: List[FrameExtractionUnitResult] = []
+        messages_log: Optional[List[List[Dict[str, str]]]] = [] if return_messages_log else None
+
+        for result_data in results_raw:
+            unit = result_data["unit"]
+            gen_text = result_data["gen_text"]
+
+            # Create result object
+            result = FrameExtractionUnitResult(
+                start=unit.start,
+                end=unit.end,
+                text=unit.text,
+                gen_text=gen_text
+            )
+            output.append(result)
+
+            # Append to messages log if requested
+            if return_messages_log:
+                final_messages = result_data["messages"] + [{"role": "assistant", "content": gen_text}]
+                messages_log.append(final_messages)
+
         if return_messages_log:
-
-
-            return
-
-            return response
-
+            return output, messages_log
+        else:
+            return output
 
-
-
-
-
-
+
+    def extract_frames(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
+                       document_key:str=None, temperature:float=0.0, verbose:bool=False,
+                       concurrent:bool=False, concurrent_batch_size:int=32,
+                       case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
+                       allow_overlap_entities:bool=False, return_messages_log:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs a text and outputs a list of LLMInformationExtractionFrame
         It use the extract() method and post-process outputs into frames.
@@ -419,17 +774,19 @@ class BasicFrameExtractor(FrameExtractor):
             the input text content to put in prompt template.
             If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
             If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
-        entity_key : str
-            the key (in ouptut JSON) for entity text. Any extraction that does not include entity key will be dropped.
         max_new_tokens : str, Optional
             the max number of new tokens LLM should generate.
-        temperature : float, Optional
-            the temperature for token sampling.
         document_key : str, Optional
             specify the key in text_content where document text is.
             If text_content is str, this parameter will be ignored.
-
-
+        temperature : float, Optional
+            the temperature for token sampling.
+        verbose : bool, Optional
+            if True, LLM generated text will be printed in terminal in real-time.
+        concurrent : bool, Optional
+            if True, the sentences will be extracted in concurrent.
+        concurrent_batch_size : int, Optional
+            the number of sentences to process in concurrent. Only used when `concurrent` is True.
         case_sensitive : bool, Optional
             if True, entity text matching will be case-sensitive.
         fuzzy_match : bool, Optional
@@ -448,58 +805,74 @@ class BasicFrameExtractor(FrameExtractor):
         Return : str
             a list of frames.
         """
-
-
-
-
-
-
+        ENTITY_KEY = "entity_text"
+        if concurrent:
+            if verbose:
+                warnings.warn("verbose=True is not supported in concurrent mode.", RuntimeWarning)
+
+            nest_asyncio.apply() # For Jupyter notebook. Terminal does not need this.
+            extraction_results = asyncio.run(self.extract_async(text_content=text_content,
+                                                                max_new_tokens=max_new_tokens,
+                                                                document_key=document_key,
+                                                                temperature=temperature,
+                                                                concurrent_batch_size=concurrent_batch_size,
+                                                                return_messages_log=return_messages_log,
+                                                                **kwrs)
+                                             )
+        else:
+            extraction_results = self.extract(text_content=text_content,
+                                              max_new_tokens=max_new_tokens,
+                                              document_key=document_key,
+                                              temperature=temperature,
+                                              verbose=verbose,
+                                              return_messages_log=return_messages_log,
+                                              **kwrs)
+
+        llm_output_results, messages_log = extraction_results if return_messages_log else (extraction_results, None)
 
         frame_list = []
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        frame_list.append(frame)
+        for res in llm_output_results:
+            entity_json = []
+            for entity in self._extract_json(gen_text=res.gen_text):
+                if ENTITY_KEY in entity:
+                    entity_json.append(entity)
+                else:
+                    warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{ENTITY_KEY}"). This frame will be dropped.', RuntimeWarning)
+
+            spans = self._find_entity_spans(text=res.text,
+                                            entities=[e[ENTITY_KEY] for e in entity_json],
+                                            case_sensitive=case_sensitive,
+                                            fuzzy_match=fuzzy_match,
+                                            fuzzy_buffer_size=fuzzy_buffer_size,
+                                            fuzzy_score_cutoff=fuzzy_score_cutoff,
+                                            allow_overlap_entities=allow_overlap_entities)
+            for ent, span in zip(entity_json, spans):
+                if span is not None:
+                    start, end = span
+                    entity_text = res.text[start:end]
+                    start += res.start
+                    end += res.start
+                    attr = {}
+                    if "attr" in ent and ent["attr"] is not None:
+                        attr = ent["attr"]
+
+                    frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}",
+                                                          start=start,
+                                                          end=end,
+                                                          entity_text=entity_text,
+                                                          attr=attr)
+                    frame_list.append(frame)
 
         if return_messages_log:
             return frame_list, messages_log
-
         return frame_list
 
 
-class ReviewFrameExtractor(
-    def __init__(self,
-                 review_mode:str, review_prompt:str=None,system_prompt:str=None, **kwrs):
+class ReviewFrameExtractor(DirectFrameExtractor):
+    def __init__(self, unit_chunker:UnitChunker, context_chunker:ContextChunker,
+                 inference_engine:InferenceEngine, prompt_template:str, review_mode:str, review_prompt:str=None, system_prompt:str=None, **kwrs):
         """
-        This class add a review step after the
+        This class add a review step after the DirectFrameExtractor.
         The Review process asks LLM to review its output and:
         1. add more frames while keep current. This is efficient for boosting recall.
         2. or, regenerate frames (add new and delete existing).
@@ -507,6 +880,10 @@ class ReviewFrameExtractor(BasicFrameExtractor):
 
         Parameters:
         ----------
+        unit_chunker : UnitChunker
+            the unit chunker object that determines how to chunk the document text into units.
+        context_chunker : ContextChunker
+            the context chunker object that determines how to get context for each unit.
         inference_engine : InferenceEngine
             the LLM inferencing engine object. Must implements the chat() method.
         prompt_template : str
@@ -520,27 +897,52 @@ class ReviewFrameExtractor(BasicFrameExtractor):
         system_prompt : str, Optional
             system prompt.
         """
-        super().__init__(inference_engine=inference_engine,
-
+        super().__init__(inference_engine=inference_engine,
+                         unit_chunker=unit_chunker,
+                         prompt_template=prompt_template,
+                         system_prompt=system_prompt,
+                         context_chunker=context_chunker,
+                         **kwrs)
+        # check review mode
         if review_mode not in {"addition", "revision"}:
             raise ValueError('review_mode must be one of {"addition", "revision"}.')
         self.review_mode = review_mode
-
+        # assign review prompt
         if review_prompt:
             self.review_prompt = review_prompt
         else:
-
-
-
-
-
-
-
+            self.review_prompt = None
+            original_class_name = self.__class__.__name__
+
+            current_class_name = original_class_name
+            for current_class_in_mro in self.__class__.__mro__:
+                if current_class_in_mro is object:
+                    continue
+
+                current_class_name = current_class_in_mro.__name__
+                try:
+                    file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
+                                joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
+                    with open(file_path, 'r', encoding="utf-8") as f:
+                        self.review_prompt = f.read()
+                except FileNotFoundError:
+                    pass
+
+                except Exception as e:
+                    warnings.warn(
+                        f"Error attempting to read default review prompt for '{current_class_name}' "
+                        f"from '{str(file_path)}': {e}. Trying next in MRO.",
+                        UserWarning
+                    )
+                    continue
+
+            if self.review_prompt is None:
+                raise ValueError(f"Cannot find review prompt for {self.__class__.__name__} in the package. Please provide a review_prompt.")
 
-    def extract(self, text_content:Union[str, Dict[str,str]],
-
+    def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=2048, document_key:str=None,
+                temperature:float=0.0, verbose:bool=False, return_messages_log:bool=False, **kwrs) -> List[FrameExtractionUnitResult]:
         """
-        This method inputs a text and outputs a
+        This method inputs a text and outputs a list of outputs per unit.
 
         Parameters:
         ----------
@@ -548,515 +950,161 @@ class ReviewFrameExtractor(BasicFrameExtractor):
|
|
|
548
950
|
the input text content to put in prompt template.
|
|
549
951
|
If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
|
|
550
952
|
If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
|
|
551
|
-
max_new_tokens :
|
|
552
|
-
the max number of new tokens LLM
|
|
953
|
+
max_new_tokens : int, Optional
|
|
954
|
+
the max number of new tokens LLM should generate.
|
|
955
|
+
document_key : str, Optional
|
|
956
|
+
specify the key in text_content where document text is.
|
|
957
|
+
If text_content is str, this parameter will be ignored.
|
|
553
958
|
temperature : float, Optional
|
|
554
|
-
the temperature for token sampling.
|
|
555
|
-
|
|
959
|
+
the temperature for token sampling.
|
|
960
|
+
verbose : bool, Optional
|
|
556
961
|
if True, LLM generated text will be printed in terminal in real-time.
|
|
557
962
|
return_messages_log : bool, Optional
|
|
558
963
|
if True, a list of messages will be returned.
|
|
559
964
|
|
|
560
|
-
Return :
|
|
561
|
-
the output from LLM.
|
|
965
|
+
Return : List[FrameExtractionUnitResult]
|
|
966
|
+
the output from LLM for each unit. Contains the start, end, text, and generated text.
|
|
562
967
|
"""
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
# Initial output
|
|
569
|
-
if stream:
|
|
570
|
-
print(f"{Fore.BLUE}Initial Output:{Style.RESET_ALL}")
|
|
571
|
-
|
|
572
|
-
initial = self.inference_engine.chat(
|
|
573
|
-
messages=messages,
|
|
574
|
-
max_new_tokens=max_new_tokens,
|
|
575
|
-
temperature=temperature,
|
|
576
|
-
stream=stream,
|
|
577
|
-
**kwrs
|
|
578
|
-
)
|
|
579
|
-
|
|
580
|
-
# Review
|
|
581
|
-
messages.append({'role': 'assistant', 'content': initial})
|
|
582
|
-
messages.append({'role': 'user', 'content': self.review_prompt})
|
|
583
|
-
|
|
584
|
-
if stream:
|
|
585
|
-
print(f"\n{Fore.YELLOW}Review:{Style.RESET_ALL}")
|
|
586
|
-
review = self.inference_engine.chat(
|
|
587
|
-
messages=messages,
|
|
588
|
-
max_new_tokens=max_new_tokens,
|
|
589
|
-
temperature=temperature,
|
|
590
|
-
stream=stream,
|
|
591
|
-
**kwrs
|
|
592
|
-
)
|
|
593
|
-
|
|
594
|
-
# Output
|
|
595
|
-
output_text = ""
|
|
596
|
-
if self.review_mode == "revision":
|
|
597
|
-
output_text = review
|
|
598
|
-
elif self.review_mode == "addition":
|
|
599
|
-
output_text = initial + '\n' + review
|
|
968
|
+
# define output
|
|
969
|
+
output = []
|
|
970
|
+
# unit chunking
|
|
971
|
+
if isinstance(text_content, str):
|
|
972
|
+
doc_text = text_content
|
|
600
973
|
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
974
|
+
elif isinstance(text_content, dict):
|
|
975
|
+
if document_key is None:
|
|
976
|
+
raise ValueError("document_key must be provided when text_content is dict.")
|
|
977
|
+
doc_text = text_content[document_key]
|
|
605
978
|
|
|
606
|
-
|
|
607
|
-
|
|
979
|
+
units = self.unit_chunker.chunk(doc_text)
|
|
980
|
+
# context chunker init
|
|
981
|
+
self.context_chunker.fit(doc_text, units)
|
|
982
|
+
# messages log
|
|
983
|
+
if return_messages_log:
|
|
984
|
+
messages_log = []
|
|
608
985
|
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
context_sentences:Union[str, int]="all", **kwrs):
|
|
613
|
-
"""
|
|
614
|
-
This class performs sentence-by-sentence information extraction.
|
|
615
|
-
The process is as follows:
|
|
616
|
-
1. system prompt (optional)
|
|
617
|
-
2. user prompt with instructions (schema, background, full text, few-shot example...)
|
|
618
|
-
3. feed a sentence (start with first sentence)
|
|
619
|
-
4. LLM extract entities and attributes from the sentence
|
|
620
|
-
5. repeat #3 and #4
|
|
621
|
-
|
|
622
|
-
Input system prompt (optional), prompt template (with user instructions),
|
|
623
|
-
and specify a LLM.
|
|
624
|
-
|
|
625
|
-
Parameters:
|
|
626
|
-
----------
|
|
627
|
-
inference_engine : InferenceEngine
|
|
628
|
-
the LLM inferencing engine object. Must implements the chat() method.
|
|
629
|
-
prompt_template : str
|
|
630
|
-
prompt template with "{{<placeholder name>}}" placeholder.
|
|
631
|
-
system_prompt : str, Optional
|
|
632
|
-
system prompt.
|
|
633
|
-
context_sentences : Union[str, int], Optional
|
|
634
|
-
number of sentences before and after the given sentence to provide additional context.
|
|
635
|
-
if "all", the full text will be provided in the prompt as context.
|
|
636
|
-
if 0, no additional context will be provided.
|
|
637
|
-
This is good for tasks that does not require context beyond the given sentence.
|
|
638
|
-
if > 0, the number of sentences before and after the given sentence to provide as context.
|
|
639
|
-
This is good for tasks that require context beyond the given sentence.
|
|
640
|
-
"""
|
|
641
|
-
super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
|
|
642
|
-
system_prompt=system_prompt, **kwrs)
|
|
643
|
-
|
|
644
|
-
if not isinstance(context_sentences, int) and context_sentences != "all":
|
|
645
|
-
raise ValueError('context_sentences must be an integer (>= 0) or "all".')
|
|
646
|
-
|
|
647
|
-
if isinstance(context_sentences, int) and context_sentences < 0:
|
|
648
|
-
raise ValueError("context_sentences must be a positive integer.")
|
|
649
|
-
|
|
650
|
-
self.context_sentences =context_sentences
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
def _get_sentences(self, text:str) -> List[Dict[str,str]]:
|
|
654
|
-
"""
|
|
655
|
-
This method sentence tokenize the input text into a list of sentences
|
|
656
|
-
as dict of {start, end, sentence_text}
|
|
657
|
-
|
|
658
|
-
Parameters:
|
|
659
|
-
----------
|
|
660
|
-
text : str
|
|
661
|
-
text to sentence tokenize.
|
|
662
|
-
|
|
663
|
-
Returns : List[Dict[str,str]]
|
|
664
|
-
a list of sentences as dict with keys: {"sentence_text", "start", "end"}.
|
|
665
|
-
"""
|
|
666
|
-
sentences = []
|
|
667
|
-
for start, end in self.PunktSentenceTokenizer().span_tokenize(text):
|
|
668
|
-
sentences.append({"sentence_text": text[start:end],
|
|
669
|
-
"start": start,
|
|
670
|
-
"end": end})
|
|
671
|
-
return sentences
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
def _get_context_sentences(self, text_content, i:int, sentences:List[Dict[str, str]], document_key:str=None) -> str:
|
|
675
|
-
"""
|
|
676
|
-
This function returns the context sentences for the current sentence of interest (i).
|
|
677
|
-
"""
|
|
678
|
-
if self.context_sentences == "all":
|
|
679
|
-
context = text_content if isinstance(text_content, str) else text_content[document_key]
|
|
680
|
-
elif self.context_sentences == 0:
|
|
681
|
-
context = ""
|
|
682
|
-
else:
|
|
683
|
-
start = max(0, i - self.context_sentences)
|
|
684
|
-
end = min(i + 1 + self.context_sentences, len(sentences))
|
|
685
|
-
context = " ".join([s['sentence_text'] for s in sentences[start:end]])
|
|
686
|
-
return context
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
|
|
690
|
-
document_key:str=None, temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict[str,str]]:
|
|
691
|
-
"""
|
|
692
|
-
This method inputs a text and outputs a list of outputs per sentence.
|
|
693
|
-
|
|
694
|
-
Parameters:
|
|
695
|
-
----------
|
|
696
|
-
text_content : Union[str, Dict[str,str]]
|
|
697
|
-
the input text content to put in prompt template.
|
|
698
|
-
If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
|
|
699
|
-
If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
|
|
700
|
-
max_new_tokens : str, Optional
|
|
701
|
-
the max number of new tokens LLM should generate.
|
|
702
|
-
document_key : str, Optional
|
|
703
|
-
specify the key in text_content where document text is.
|
|
704
|
-
If text_content is str, this parameter will be ignored.
|
|
705
|
-
temperature : float, Optional
|
|
706
|
-
the temperature for token sampling.
|
|
707
|
-
stream : bool, Optional
|
|
708
|
-
if True, LLM generated text will be printed in terminal in real-time.
|
|
709
|
-
return_messages_log : bool, Optional
|
|
710
|
-
if True, a list of messages will be returned.
|
|
711
|
-
|
|
712
|
-
Return : str
|
|
713
|
-
the output from LLM. Need post-processing.
|
|
714
|
-
"""
|
|
715
|
-
# define output
|
|
716
|
-
output = []
|
|
717
|
-
# sentence tokenization
|
|
718
|
-
if isinstance(text_content, str):
|
|
719
|
-
sentences = self._get_sentences(text_content)
|
|
720
|
-
elif isinstance(text_content, dict):
|
|
721
|
-
if document_key is None:
|
|
722
|
-
raise ValueError("document_key must be provided when text_content is dict.")
|
|
723
|
-
sentences = self._get_sentences(text_content[document_key])
|
|
724
|
-
|
|
725
|
-
if return_messages_log:
|
|
726
|
-
messages_log = []
|
|
727
|
-
|
|
728
|
-
# generate sentence by sentence
|
|
729
|
-
for i, sent in enumerate(sentences):
|
|
986
|
+
# generate unit by unit
|
|
987
|
+
for i, unit in enumerate(units):
|
|
988
|
+
# <--- Initial generation step --->
|
|
730
989
|
# construct chat messages
|
|
731
990
|
messages = []
|
|
732
991
|
if self.system_prompt:
|
|
733
992
|
messages.append({'role': 'system', 'content': self.system_prompt})
|
|
734
993
|
|
|
735
|
-
context = self.
|
|
994
|
+
context = self.context_chunker.chunk(unit)
|
|
736
995
|
|
|
737
|
-
if
|
|
738
|
-
# no context, just place
|
|
996
|
+
if context == "":
|
|
997
|
+
# no context, just place unit in user prompt
|
|
739
998
|
if isinstance(text_content, str):
|
|
740
|
-
messages.append({'role': 'user', 'content': self._get_user_prompt(
|
|
999
|
+
messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
|
|
741
1000
|
else:
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
messages.append({'role': 'user', 'content': self._get_user_prompt(
|
|
1001
|
+
unit_content = text_content.copy()
|
|
1002
|
+
unit_content[document_key] = unit.text
|
|
1003
|
+
messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
|
|
745
1004
|
else:
|
|
746
|
-
# insert context
|
|
1005
|
+
# insert context to user prompt
|
|
747
1006
|
if isinstance(text_content, str):
|
|
748
1007
|
messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
|
|
749
1008
|
else:
|
|
750
1009
|
context_content = text_content.copy()
|
|
751
1010
|
context_content[document_key] = context
|
|
752
1011
|
messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
|
|
753
|
-
# simulate conversation
|
|
754
|
-
messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
|
|
755
|
-
# place
|
|
756
|
-
messages.append({'role': 'user', 'content':
|
|
757
|
-
|
|
758
|
-
if
|
|
759
|
-
print(f"\n\n{Fore.GREEN}
|
|
760
|
-
if
|
|
1012
|
+
# simulate conversation where assistant confirms
|
|
1013
|
+
messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
|
|
1014
|
+
# place unit of interest
|
|
1015
|
+
messages.append({'role': 'user', 'content': unit.text})
|
|
1016
|
+
|
|
1017
|
+
if verbose:
|
|
1018
|
+
print(f"\n\n{Fore.GREEN}Unit {i}:{Style.RESET_ALL}\n{unit.text}\n")
|
|
1019
|
+
if context != "":
|
|
761
1020
|
print(f"{Fore.YELLOW}Context:{Style.RESET_ALL}\n{context}\n")
|
|
762
1021
|
|
|
763
1022
|
print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
|
|
764
1023
|
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
1024
|
+
response_stream = self.inference_engine.chat(
|
|
1025
|
+
messages=messages,
|
|
1026
|
+
max_new_tokens=max_new_tokens,
|
|
1027
|
+
temperature=temperature,
|
|
1028
|
+
stream=True,
|
|
1029
|
+
**kwrs
|
|
1030
|
+
)
|
|
1031
|
+
|
|
1032
|
+
initial = ""
|
|
1033
|
+
for chunk in response_stream:
|
|
1034
|
+
initial += chunk
|
|
1035
|
+
print(chunk, end='', flush=True)
|
|
772
1036
|
|
|
1037
|
+
else:
|
|
1038
|
+
initial = self.inference_engine.chat(
|
|
1039
|
+
messages=messages,
|
|
1040
|
+
max_new_tokens=max_new_tokens,
|
|
1041
|
+
temperature=temperature,
|
|
1042
|
+
stream=False,
|
|
1043
|
+
**kwrs
|
|
1044
|
+
)
|
|
1045
|
+
|
|
773
1046
|
if return_messages_log:
|
|
774
|
-
messages.append({"role": "assistant", "content":
|
|
1047
|
+
messages.append({"role": "assistant", "content": initial})
|
|
775
1048
|
messages_log.append(messages)
|
|
776
1049
|
|
|
777
|
-
#
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
'sentence_text': sent['sentence_text'],
|
|
781
|
-
'gen_text': gen_text})
|
|
782
|
-
|
|
783
|
-
if return_messages_log:
|
|
784
|
-
return output, messages_log
|
|
785
|
-
|
|
786
|
-
return output
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
async def extract_async(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
|
|
790
|
-
document_key:str=None, temperature:float=0.0, concurrent_batch_size:int=32,
|
|
791
|
-
return_messages_log:bool=False, **kwrs) -> List[Dict[str,str]]:
|
|
792
|
-
"""
|
|
793
|
-
The asynchronous version of the extract() method.
|
|
794
|
-
|
|
795
|
-
Parameters:
|
|
796
|
-
----------
|
|
797
|
-
text_content : Union[str, Dict[str,str]]
|
|
798
|
-
the input text content to put in prompt template.
|
|
799
|
-
If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
|
|
800
|
-
If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
|
|
801
|
-
max_new_tokens : str, Optional
|
|
802
|
-
the max number of new tokens LLM should generate.
|
|
803
|
-
document_key : str, Optional
|
|
804
|
-
specify the key in text_content where document text is.
|
|
805
|
-
If text_content is str, this parameter will be ignored.
|
|
806
|
-
temperature : float, Optional
|
|
807
|
-
the temperature for token sampling.
|
|
808
|
-
concurrent_batch_size : int, Optional
|
|
809
|
-
the number of sentences to process in concurrent.
|
|
810
|
-
return_messages_log : bool, Optional
|
|
811
|
-
if True, a list of messages will be returned.
|
|
812
|
-
|
|
813
|
-
Return : str
|
|
814
|
-
the output from LLM. Need post-processing.
|
|
815
|
-
"""
|
|
816
|
-
# Check if self.inference_engine.chat_async() is implemented
|
|
817
|
-
if not hasattr(self.inference_engine, 'chat_async'):
|
|
818
|
-
raise NotImplementedError(f"{self.inference_engine.__class__.__name__} does not have chat_async() method.")
|
|
819
|
-
|
|
820
|
-
# define output
|
|
821
|
-
output = []
|
|
822
|
-
# sentence tokenization
|
|
823
|
-
if isinstance(text_content, str):
|
|
824
|
-
sentences = self._get_sentences(text_content)
|
|
825
|
-
elif isinstance(text_content, dict):
|
|
826
|
-
if document_key is None:
|
|
827
|
-
raise ValueError("document_key must be provided when text_content is dict.")
|
|
828
|
-
sentences = self._get_sentences(text_content[document_key])
|
|
829
|
-
|
|
830
|
-
if return_messages_log:
|
|
831
|
-
messages_log = []
|
|
1050
|
+
# <--- Review step --->
|
|
1051
|
+
if verbose:
|
|
1052
|
+
print(f"\n{Fore.YELLOW}Review:{Style.RESET_ALL}")
|
|
832
1053
|
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
tasks = []
|
|
836
|
-
batch = sentences[i:i + concurrent_batch_size]
|
|
837
|
-
batch_messages = []
|
|
838
|
-
for j, sent in enumerate(batch):
|
|
839
|
-
# construct chat messages
|
|
840
|
-
messages = []
|
|
841
|
-
if self.system_prompt:
|
|
842
|
-
messages.append({'role': 'system', 'content': self.system_prompt})
|
|
1054
|
+
messages.append({'role': 'assistant', 'content': initial})
|
|
1055
|
+
messages.append({'role': 'user', 'content': self.review_prompt})
|
|
843
1056
|
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
sentence_content[document_key] = sent['sentence_text']
|
|
853
|
-
messages.append({'role': 'user', 'content': self._get_user_prompt(sentence_content)})
|
|
854
|
-
else:
|
|
855
|
-
# insert context
|
|
856
|
-
if isinstance(text_content, str):
|
|
857
|
-
messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
|
|
858
|
-
else:
|
|
859
|
-
context_content = text_content.copy()
|
|
860
|
-
context_content[document_key] = context
|
|
861
|
-
messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
|
|
862
|
-
# simulate conversation
|
|
863
|
-
messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
|
|
864
|
-
# place sentence of interest
|
|
865
|
-
messages.append({'role': 'user', 'content': sent['sentence_text']})
|
|
1057
|
+
if verbose:
|
|
1058
|
+
response_stream = self.inference_engine.chat(
|
|
1059
|
+
messages=messages,
|
|
1060
|
+
max_new_tokens=max_new_tokens,
|
|
1061
|
+
temperature=temperature,
|
|
1062
|
+
stream=True,
|
|
1063
|
+
**kwrs
|
|
1064
|
+
)
|
|
866
1065
|
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
1066
|
+
review = ""
|
|
1067
|
+
for chunk in response_stream:
|
|
1068
|
+
review += chunk
|
|
1069
|
+
print(chunk, end='', flush=True)
|
|
1070
|
+
|
|
1071
|
+
else:
|
|
1072
|
+
review = self.inference_engine.chat(
|
|
870
1073
|
messages=messages,
|
|
871
1074
|
max_new_tokens=max_new_tokens,
|
|
872
1075
|
temperature=temperature,
|
|
1076
|
+
stream=False,
|
|
873
1077
|
**kwrs
|
|
874
1078
|
)
|
|
875
|
-
)
|
|
876
|
-
tasks.append(task)
|
|
877
|
-
batch_messages.append(messages)
|
|
878
1079
|
|
|
879
|
-
#
|
|
880
|
-
|
|
1080
|
+
# Output
|
|
1081
|
+
if self.review_mode == "revision":
|
|
1082
|
+
gen_text = review
|
|
1083
|
+
elif self.review_mode == "addition":
|
|
1084
|
+
gen_text = initial + '\n' + review
|
|
881
1085
|
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
messages.append({"role": "assistant", "content": gen_text})
|
|
886
|
-
messages_log.append(messages)
|
|
1086
|
+
if return_messages_log:
|
|
1087
|
+
messages.append({"role": "assistant", "content": review})
|
|
1088
|
+
messages_log.append(messages)
|
|
887
1089
|
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
1090
|
+
# add to output
|
|
1091
|
+
result = FrameExtractionUnitResult(
|
|
1092
|
+
start=unit.start,
|
|
1093
|
+
end=unit.end,
|
|
1094
|
+
text=unit.text,
|
|
1095
|
+
gen_text=gen_text)
|
|
1096
|
+
output.append(result)
|
|
1097
|
+
|
|
893
1098
|
if return_messages_log:
|
|
894
1099
|
return output, messages_log
|
|
895
1100
|
|
|
896
1101
|
return output
|
|
897
|
-
|
|
898
1102
|
|
|
899
|
-
def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=512,
|
|
900
|
-
document_key:str=None, temperature:float=0.0, stream:bool=False,
|
|
901
|
-
concurrent:bool=False, concurrent_batch_size:int=32,
|
|
902
|
-
case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
|
|
903
|
-
allow_overlap_entities:bool=False, return_messages_log:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
|
|
904
|
-
"""
|
|
905
|
-
This method inputs a text and outputs a list of LLMInformationExtractionFrame
|
|
906
|
-
It use the extract() method and post-process outputs into frames.
|
|
907
1103
|
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
text_content : Union[str, Dict[str,str]]
|
|
911
|
-
the input text content to put in prompt template.
|
|
912
|
-
If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
|
|
913
|
-
If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
|
|
914
|
-
entity_key : str
|
|
915
|
-
the key (in ouptut JSON) for entity text.
|
|
916
|
-
max_new_tokens : str, Optional
|
|
917
|
-
the max number of new tokens LLM should generate.
|
|
918
|
-
document_key : str, Optional
|
|
919
|
-
specify the key in text_content where document text is.
|
|
920
|
-
If text_content is str, this parameter will be ignored.
|
|
921
|
-
temperature : float, Optional
|
|
922
|
-
the temperature for token sampling.
|
|
923
|
-
stream : bool, Optional
|
|
924
|
-
if True, LLM generated text will be printed in terminal in real-time.
|
|
925
|
-
concurrent : bool, Optional
|
|
926
|
-
if True, the sentences will be extracted in concurrent.
|
|
927
|
-
concurrent_batch_size : int, Optional
|
|
928
|
-
the number of sentences to process in concurrent. Only used when `concurrent` is True.
|
|
929
|
-
case_sensitive : bool, Optional
|
|
930
|
-
if True, entity text matching will be case-sensitive.
|
|
931
|
-
fuzzy_match : bool, Optional
|
|
932
|
-
if True, fuzzy matching will be applied to find entity text.
|
|
933
|
-
fuzzy_buffer_size : float, Optional
|
|
934
|
-
the buffer size for fuzzy matching. Default is 20% of entity text length.
|
|
935
|
-
fuzzy_score_cutoff : float, Optional
|
|
936
|
-
the Jaccard score cutoff for fuzzy matching.
|
|
937
|
-
Matched entity text must have a score higher than this value or a None will be returned.
|
|
938
|
-
allow_overlap_entities : bool, Optional
|
|
939
|
-
if True, entities can overlap in the text.
|
|
940
|
-
Note that this can cause multiple frames to be generated on the same entity span if they have same entity text.
|
|
941
|
-
return_messages_log : bool, Optional
|
|
942
|
-
if True, a list of messages will be returned.
|
|
943
|
-
|
|
944
|
-
Return : str
|
|
945
|
-
a list of frames.
+    def stream(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=2048,
+               document_key:str=None, temperature:float=0.0, **kwrs) -> Generator[str, None, None]:
         """
-
-        if stream:
-            warnings.warn("stream=True is not supported in concurrent mode.", RuntimeWarning)
-
-            nest_asyncio.apply() # For Jupyter notebook. Terminal does not need this.
-            extraction_results = asyncio.run(self.extract_async(text_content=text_content,
-                                                                max_new_tokens=max_new_tokens,
-                                                                document_key=document_key,
-                                                                temperature=temperature,
-                                                                concurrent_batch_size=concurrent_batch_size,
-                                                                return_messages_log=return_messages_log,
-                                                                **kwrs)
-                                             )
-        else:
-            extraction_results = self.extract(text_content=text_content,
-                                              max_new_tokens=max_new_tokens,
-                                              document_key=document_key,
-                                              temperature=temperature,
-                                              stream=stream,
-                                              return_messages_log=return_messages_log,
-                                              **kwrs)
-
-        llm_output_sentences, messages_log = extraction_results if return_messages_log else (extraction_results, None)
-
-        frame_list = []
-        for sent in llm_output_sentences:
-            entity_json = []
-            for entity in self._extract_json(gen_text=sent['gen_text']):
-                if entity_key in entity:
-                    entity_json.append(entity)
-                else:
-                    warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{entity_key}"). This frame will be dropped.', RuntimeWarning)
-
-            spans = self._find_entity_spans(text=sent['sentence_text'],
-                                            entities=[e[entity_key] for e in entity_json],
-                                            case_sensitive=case_sensitive,
-                                            fuzzy_match=fuzzy_match,
-                                            fuzzy_buffer_size=fuzzy_buffer_size,
-                                            fuzzy_score_cutoff=fuzzy_score_cutoff,
-                                            allow_overlap_entities=allow_overlap_entities)
-            for ent, span in zip(entity_json, spans):
-                if span is not None:
-                    start, end = span
-                    entity_text = sent['sentence_text'][start:end]
-                    start += sent['sentence_start']
-                    end += sent['sentence_start']
-                    frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}",
-                                                          start=start,
-                                                          end=end,
-                                                          entity_text=entity_text,
-                                                          attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
-                    frame_list.append(frame)
-
-        if return_messages_log:
-            return frame_list, messages_log
-        return frame_list
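The removed block above is the pre-1.0 frame post-processing path: entity JSON from the LLM is grounded back to character offsets in the source sentence and wrapped in LLMInformationExtractionFrame objects. A stripped-down sketch of that idea, using plain dicts instead of the package classes (names and values are illustrative):

sentence = {"sentence_start": 120, "sentence_text": "Aspirin 81 mg daily."}
entities = [{"entity_text": "Aspirin", "Dose": "81 mg", "Frequency": "daily"}]

frames = []
for i, ent in enumerate(entities):
    # exact string match here; the package also supports case-insensitive and fuzzy matching
    local = sentence["sentence_text"].find(ent["entity_text"])
    if local != -1:
        start = sentence["sentence_start"] + local   # shift to document-level offsets
        frames.append({
            "frame_id": str(i),
            "start": start,
            "end": start + len(ent["entity_text"]),
            "entity_text": ent["entity_text"],
            "attr": {k: v for k, v in ent.items() if k != "entity_text" and v != ""},
        })

print(frames)  # [{'frame_id': '0', 'start': 120, 'end': 127, 'entity_text': 'Aspirin', 'attr': {...}}]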
-
-
-class SentenceReviewFrameExtractor(SentenceFrameExtractor):
-    def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
-                 review_mode:str, review_prompt:str=None, system_prompt:str=None,
-                 context_sentences:Union[str, int]="all", **kwrs):
-        """
-        This class adds a review step after the SentenceFrameExtractor.
-        For each sentence, the review process asks LLM to review its output and:
-            1. add more frames while keeping current. This is efficient for boosting recall.
-            2. or, regenerate frames (add new and delete existing).
-        Use the review_mode parameter to specify. Note that the review_prompt should instruct LLM accordingly.
-
-        Parameters:
-        ----------
-        inference_engine : InferenceEngine
-            the LLM inferencing engine object. Must implements the chat() method.
-        prompt_template : str
-            prompt template with "{{<placeholder name>}}" placeholder.
-        review_prompt : str: Optional
-            the prompt text that ask LLM to review. Specify addition or revision in the instruction.
-            if not provided, a default review prompt will be used.
-        review_mode : str
-            review mode. Must be one of {"addition", "revision"}
-            addition mode only ask LLM to add new frames, while revision mode ask LLM to regenerate.
-        system_prompt : str, Optional
-            system prompt.
-        context_sentences : Union[str, int], Optional
-            number of sentences before and after the given sentence to provide additional context.
-            if "all", the full text will be provided in the prompt as context.
-            if 0, no additional context will be provided.
-                This is good for tasks that does not require context beyond the given sentence.
-            if > 0, the number of sentences before and after the given sentence to provide as context.
-                This is good for tasks that require context beyond the given sentence.
-        """
-        super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
-                         system_prompt=system_prompt, context_sentences=context_sentences, **kwrs)
-
-        if review_mode not in {"addition", "revision"}:
-            raise ValueError('review_mode must be one of {"addition", "revision"}.')
-        self.review_mode = review_mode
-
-        if review_prompt:
-            self.review_prompt = review_prompt
-        else:
-            file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
-                joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
-            with open(file_path, 'r', encoding="utf-8") as f:
-                self.review_prompt = f.read()
-
-            warnings.warn(f'Custom review prompt not provided. The default review prompt is used:\n"{self.review_prompt}"', UserWarning)
-
-
-    def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
-                document_key:str=None, temperature:float=0.0, stream:bool=False, return_messages_log:bool=False, **kwrs) -> List[Dict[str,str]]:
-        """
-        This method inputs a text and outputs a list of outputs per sentence.
+        This method inputs a text and outputs a list of outputs per unit.
 
         Parameters:
         ----------
@@ -1064,281 +1112,371 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
             the input text content to put in prompt template.
             If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
             If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
-        max_new_tokens :
+        max_new_tokens : int, Optional
             the max number of new tokens LLM should generate.
         document_key : str, Optional
             specify the key in text_content where document text is.
             If text_content is str, this parameter will be ignored.
         temperature : float, Optional
             the temperature for token sampling.
-        stream : bool, Optional
-            if True, LLM generated text will be printed in terminal in real-time.
-        return_messages_log : bool, Optional
-            if True, a list of messages will be returned.
 
-        Return :
-            the output from LLM.
+        Return : List[FrameExtractionUnitResult]
+            the output from LLM for each unit. Contains the start, end, text, and generated text.
         """
-        #
-        output = []
-        # sentence tokenization
+        # unit chunking
         if isinstance(text_content, str):
-
+            doc_text = text_content
+
         elif isinstance(text_content, dict):
             if document_key is None:
                 raise ValueError("document_key must be provided when text_content is dict.")
-
+            doc_text = text_content[document_key]
 
-
-
+        units = self.unit_chunker.chunk(doc_text)
+        # context chunker init
+        self.context_chunker.fit(doc_text, units)
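In 1.0.0 the extraction loop is driven by two pluggable pieces: a UnitChunker that splits the document into FrameExtractionUnit objects and a ContextChunker that is first fit on the whole document and then asked for context around each unit. A rough sketch of that contract, using the chunker classes imported at the top of this module (the example text and printed fields are illustrative):

from llm_ie.chunkers import SentenceUnitChunker, SlideWindowContextChunker

doc_text = "Pt c/o chest pain. Started on aspirin 81 mg daily. Follow up in 2 weeks."

unit_chunker = SentenceUnitChunker()
units = unit_chunker.chunk(doc_text)                         # FrameExtractionUnit objects with start/end/text

context_chunker = SlideWindowContextChunker(window_size=2)   # 2 sentences on each side, as in the code below
context_chunker.fit(doc_text, units)

for unit in units:
    context = context_chunker.chunk(unit)                    # "" means no extra context for this unit
    print(repr(unit.text), "| context:", repr(context[:40]))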
 
-        # generate
-        for i,
+        # generate unit by unit
+        for i, unit in enumerate(units):
+            # <--- Initial generation step --->
             # construct chat messages
             messages = []
             if self.system_prompt:
                 messages.append({'role': 'system', 'content': self.system_prompt})
 
-            context = self.
+            context = self.context_chunker.chunk(unit)
 
-            if
-                # no context, just place
+            if context == "":
+                # no context, just place unit in user prompt
                 if isinstance(text_content, str):
-                    messages.append({'role': 'user', 'content': self._get_user_prompt(
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
                 else:
-
-
-                    messages.append({'role': 'user', 'content': self._get_user_prompt(
+                    unit_content = text_content.copy()
+                    unit_content[document_key] = unit.text
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
             else:
-                # insert context
+                # insert context to user prompt
                 if isinstance(text_content, str):
                     messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
                 else:
                     context_content = text_content.copy()
                     context_content[document_key] = context
                     messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
-            # simulate conversation
-            messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
-            # place
-            messages.append({'role': 'user', 'content':
+            # simulate conversation where assistant confirms
+            messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
+            # place unit of interest
+            messages.append({'role': 'user', 'content': unit.text})
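For one unit, the loop above builds a short simulated conversation before calling the inference engine. Roughly, the resulting messages list looks like this (the system prompt, template content, and unit text are illustrative):

messages = [
    {"role": "system", "content": "You are a clinical information extraction assistant."},      # optional system prompt
    {"role": "user", "content": "<instructions + schema + few-shot examples + context text>"},  # rendered prompt template
    {"role": "assistant", "content": "Sure, please provide the unit text (e.g., sentence, line, chunk) of interest."},
    {"role": "user", "content": "Started on aspirin 81 mg daily."},                             # the unit to extract from
]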
 
-            if stream:
-                print(f"\n\n{Fore.GREEN}Sentence {i}: {Style.RESET_ALL}\n{sent['sentence_text']}\n")
-                if isinstance(self.context_sentences, int) and self.context_sentences > 0:
-                    print(f"{Fore.YELLOW}Context:{Style.RESET_ALL}\n{context}\n")
-                print(f"{Fore.BLUE}Initial Output:{Style.RESET_ALL}")
 
-
+            yield f"\n\n{Fore.GREEN}Unit {i}:{Style.RESET_ALL}\n{unit.text}\n"
+            if context != "":
+                yield f"{Fore.YELLOW}Context:{Style.RESET_ALL}\n{context}\n"
+
+            yield f"{Fore.BLUE}Extraction:{Style.RESET_ALL}\n"
+
+            response_stream = self.inference_engine.chat(
                 messages=messages,
                 max_new_tokens=max_new_tokens,
                 temperature=temperature,
-                stream=
+                stream=True,
                 **kwrs
             )
 
-
-
-
+            initial = ""
+            for chunk in response_stream:
+                initial += chunk
+                yield chunk
+
+            # <--- Review step --->
+            yield f"\n{Fore.YELLOW}Review:{Style.RESET_ALL}"
 
             messages.append({'role': 'assistant', 'content': initial})
             messages.append({'role': 'user', 'content': self.review_prompt})
 
-
+            response_stream = self.inference_engine.chat(
                 messages=messages,
                 max_new_tokens=max_new_tokens,
                 temperature=temperature,
-                stream=
+                stream=True,
                 **kwrs
            )
 
-
-
-                gen_text = review
-            elif self.review_mode == "addition":
-                gen_text = initial + '\n' + review
-
-            if return_messages_log:
-                messages.append({"role": "assistant", "content": review})
-                messages_log.append(messages)
+            for chunk in response_stream:
+                yield chunk
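The old print-based streaming is gone; the new stream() method is a generator (Generator[str, None, None] per the signature earlier in this hunk) that yields the colored unit/context headers, the extraction output, and the review output as they are produced. A minimal consumption sketch, assuming an already-constructed extractor and a document string note_text:

for chunk in extractor.stream(text_content=note_text, max_new_tokens=2048, temperature=0.0):
    print(chunk, end="", flush=True)   # render the yielded headers and LLM output live in the terminal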
 
-
-
-                           'sentence_end': sent['end'],
-                           'sentence_text': sent['sentence_text'],
-                           'gen_text': gen_text})
-
-        if return_messages_log:
-            return output, messages_log
-
-        return output
-
-    async def extract_async(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
-                            document_key:str=None, temperature:float=0.0, concurrent_batch_size:int=32, return_messages_log:bool=False, **kwrs) -> List[Dict[str,str]]:
+    async def extract_async(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=2048, document_key:str=None, temperature:float=0.0,
+                            concurrent_batch_size:int=32, return_messages_log:bool=False, **kwrs) -> List[FrameExtractionUnitResult]:
         """
-
+        This is the asynchronous version of the extract() method with the review step.
 
         Parameters:
         ----------
         text_content : Union[str, Dict[str,str]]
-            the input text content to put in prompt template.
+            the input text content to put in prompt template.
             If str, the prompt template must has only 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
             If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
-        max_new_tokens :
-            the max number of new tokens LLM should generate.
+        max_new_tokens : int, Optional
+            the max number of new tokens LLM should generate.
         document_key : str, Optional
-            specify the key in text_content where document text is.
+            specify the key in text_content where document text is.
             If text_content is str, this parameter will be ignored.
         temperature : float, Optional
             the temperature for token sampling.
         concurrent_batch_size : int, Optional
-            the
+            the batch size for concurrent processing.
         return_messages_log : bool, Optional
-            if True, a list of messages will be returned.
+            if True, a list of messages will be returned, including review steps.
 
-        Return :
-            the output from LLM.
+        Return : List[FrameExtractionUnitResult]
+            the output from LLM for each unit after review. Contains the start, end, text, and generated text.
         """
-        # Check if self.inference_engine.chat_async() is implemented
-        if not hasattr(self.inference_engine, 'chat_async'):
-            raise NotImplementedError(f"{self.inference_engine.__class__.__name__} does not have chat_async() method.")
-
-        # define output
-        output = []
-        # sentence tokenization
         if isinstance(text_content, str):
-
+            doc_text = text_content
         elif isinstance(text_content, dict):
             if document_key is None:
                 raise ValueError("document_key must be provided when text_content is dict.")
-
+            if document_key not in text_content:
+                raise ValueError(f"document_key '{document_key}' not found in text_content dictionary.")
+            doc_text = text_content[document_key]
+        else:
+            raise TypeError("text_content must be a string or a dictionary.")
 
-
-        messages_log = []
+        units = self.unit_chunker.chunk(doc_text)
 
-        #
-
-        messages_list = []
-        init_tasks = []
-        review_tasks = []
-            batch = sentences[i:i + concurrent_batch_size]
-            for j, sent in enumerate(batch):
-                # construct chat messages
-                messages = []
-                if self.system_prompt:
-                    messages.append({'role': 'system', 'content': self.system_prompt})
+        # context chunker init
+        self.context_chunker.fit(doc_text, units)
 
-        [old lines 1230-1239 removed; content not recoverable in this view]
+        # <--- Initial generation step --->
+        initial_tasks_input = []
+        for i, unit in enumerate(units):
+            # construct chat messages for initial generation
+            messages = []
+            if self.system_prompt:
+                messages.append({'role': 'system', 'content': self.system_prompt})
+
+            context = self.context_chunker.chunk(unit)
+
+            if context == "":
+                # no context, just place unit in user prompt
+                if isinstance(text_content, str):
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(unit.text)})
                 else:
-        [old lines 1241-1250 removed; content not recoverable in this view]
-                    messages.append({'role': 'user', 'content':
-        [old lines 1252-1261 removed; content not recoverable in this view]
+                    unit_content = text_content.copy()
+                    unit_content[document_key] = unit.text
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(unit_content)})
+            else:
+                # insert context to user prompt
+                if isinstance(text_content, str):
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                else:
+                    context_content = text_content.copy()
+                    context_content[document_key] = context
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
+            # simulate conversation where assistant confirms
+            messages.append({'role': 'assistant', 'content': 'Sure, please provide the unit text (e.g., sentence, line, chunk) of interest.'})
+            # place unit of interest
+            messages.append({'role': 'user', 'content': unit.text})
+
+            # Store unit and messages together for the initial task
+            initial_tasks_input.append({"unit": unit, "messages": messages, "original_index": i})
+
+        semaphore = asyncio.Semaphore(concurrent_batch_size)
+
+        async def initial_semaphore_helper(task_data: Dict, max_new_tokens: int, temperature: float, **kwrs):
+            unit = task_data["unit"]
+            messages = task_data["messages"]
+            original_index = task_data["original_index"]
+
+            async with semaphore:
+                gen_text = await self.inference_engine.chat_async(
+                    messages=messages,
+                    max_new_tokens=max_new_tokens,
+                    temperature=temperature,
+                    **kwrs
                 )
-        [old lines 1263-1306 removed; content not recoverable in this view]
+                # Return initial generation result along with the messages used and the unit
+                return {"original_index": original_index, "unit": unit, "initial_gen_text": gen_text, "initial_messages": messages}
+
+        # Create and gather initial generation tasks
+        initial_tasks = [
+            asyncio.create_task(initial_semaphore_helper(
+                task_inp,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                **kwrs
+            ))
+            for task_inp in initial_tasks_input
+        ]
+
+        initial_results_raw = await asyncio.gather(*initial_tasks)
+
+        # Sort initial results back into original order
+        initial_results_raw.sort(key=lambda x: x["original_index"])
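The async path above bounds concurrency with an asyncio.Semaphore sized by concurrent_batch_size and then gathers all unit tasks at once, restoring the original order afterwards. The general pattern, stripped of the extractor specifics (the worker and item names are illustrative):

import asyncio

async def bounded_gather(items, worker, limit=32):
    # Run worker(item) for every item, at most `limit` at a time, preserving input order
    semaphore = asyncio.Semaphore(limit)

    async def run_one(i, item):
        async with semaphore:
            return i, await worker(item)

    results = await asyncio.gather(*(run_one(i, it) for i, it in enumerate(items)))
    return [r for _, r in sorted(results)]

async def fake_llm_call(unit_text):
    await asyncio.sleep(0.01)          # stand-in for an LLM request
    return f"extracted from: {unit_text}"

print(asyncio.run(bounded_gather(["sent 1", "sent 2", "sent 3"], fake_llm_call, limit=2)))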
+        # <--- Review step --->
+        review_tasks_input = []
+        for result_data in initial_results_raw:
+            # Prepare messages for the review step
+            initial_messages = result_data["initial_messages"]
+            initial_gen_text = result_data["initial_gen_text"]
+            review_messages = initial_messages + [
+                {'role': 'assistant', 'content': initial_gen_text},
+                {'role': 'user', 'content': self.review_prompt}
+            ]
+            # Store data needed for review task
+            review_tasks_input.append({
+                "unit": result_data["unit"],
+                "initial_gen_text": initial_gen_text,
+                "messages": review_messages,
+                "original_index": result_data["original_index"],
+                "full_initial_log": initial_messages + [{'role': 'assistant', 'content': initial_gen_text}] if return_messages_log else None # Log up to initial generation
+            })
+
+
+        async def review_semaphore_helper(task_data: Dict, max_new_tokens: int, temperature: float, **kwrs):
+            messages = task_data["messages"]
+            original_index = task_data["original_index"]
+
+            async with semaphore:
+                review_gen_text = await self.inference_engine.chat_async(
+                    messages=messages,
+                    max_new_tokens=max_new_tokens,
+                    temperature=temperature,
+                    **kwrs
+                )
+            # Combine initial and review results
+            task_data["review_gen_text"] = review_gen_text
+            if return_messages_log:
+                # Log for the review call itself
+                task_data["full_review_log"] = messages + [{'role': 'assistant', 'content': review_gen_text}]
+            return task_data # Return the augmented dictionary
+
+        # Create and gather review tasks
+        review_tasks = [
+            asyncio.create_task(review_semaphore_helper(
+                task_inp,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                **kwrs
+            ))
+            for task_inp in review_tasks_input
+        ]
+
+        final_results_raw = await asyncio.gather(*review_tasks)
+
+        # Sort final results back into original order (although gather might preserve order for tasks added sequentially)
+        final_results_raw.sort(key=lambda x: x["original_index"])
+
+        # <--- Process final results --->
+        output: List[FrameExtractionUnitResult] = []
+        messages_log: Optional[List[List[Dict[str, str]]]] = [] if return_messages_log else None
+
+        for result_data in final_results_raw:
+            unit = result_data["unit"]
+            initial_gen = result_data["initial_gen_text"]
+            review_gen = result_data["review_gen_text"]
+
+            # Combine based on review mode
+            if self.review_mode == "revision":
+                final_gen_text = review_gen
+            elif self.review_mode == "addition":
+                final_gen_text = initial_gen + '\n' + review_gen
+            else: # Should not happen due to init check
+                final_gen_text = review_gen # Default to revision if mode is somehow invalid
+
+            # Create final result object
+            result = FrameExtractionUnitResult(
+                start=unit.start,
+                end=unit.end,
+                text=unit.text,
+                gen_text=final_gen_text # Use the combined/reviewed text
+            )
+            output.append(result)
+
+            # Append full conversation log if requested
+            if return_messages_log:
+                full_log_for_unit = result_data.get("full_initial_log", []) + [{'role': 'user', 'content': self.review_prompt}] + [{'role': 'assistant', 'content': review_gen}]
+                messages_log.append(full_log_for_unit)
 
-        [old lines 1308-1311 removed; content not recoverable in this view]
+        if return_messages_log:
+            return output, messages_log
+        else:
+            return output
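A hedged sketch of driving the asynchronous review extraction end to end. The extractor and document are assumed to be set up already, and the engine must implement chat_async(); extract_async() is awaitable, so it is wrapped in asyncio.run here:

import asyncio

results = asyncio.run(
    extractor.extract_async(
        text_content=note_text,
        max_new_tokens=2048,
        temperature=0.0,
        concurrent_batch_size=32,     # at most 32 units in flight at once
    )
)
for r in results:
    print(r.start, r.end, r.gen_text[:60])   # FrameExtractionUnitResult fields, as returned above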
+
+
+class BasicFrameExtractor(DirectFrameExtractor):
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
+        """
+        This class diretly prompt LLM for frame extraction.
+        Input system prompt (optional), prompt template (with instruction, few-shot examples),
+        and specify a LLM.
 
-        [old lines 1313-1317 removed; content not recoverable in this view]
+        Parameters:
+        ----------
+        inference_engine : InferenceEngine
+            the LLM inferencing engine object. Must implements the chat() method.
+        prompt_template : str
+            prompt template with "{{<placeholder name>}}" placeholder.
+        system_prompt : str, Optional
+            system prompt.
+        """
+        super().__init__(inference_engine=inference_engine,
+                         unit_chunker=WholeDocumentUnitChunker(),
+                         prompt_template=prompt_template,
+                         system_prompt=system_prompt,
+                         context_chunker=NoContextChunker(),
+                         **kwrs)
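In 1.0.0 BasicFrameExtractor becomes a thin DirectFrameExtractor configured with a WholeDocumentUnitChunker and NoContextChunker, so the whole document is a single extraction unit. A minimal construction sketch; the engine class, its constructor arguments, the prompt text, and the extract_frames() call are assumptions based on the rest of the package, not guaranteed by this hunk alone:

from llm_ie.engines import OllamaInferenceEngine      # any InferenceEngine implementation; name assumed
from llm_ie.extractors import BasicFrameExtractor

prompt_template = """Extract medications from the note below.
Return a JSON list like [{"entity_text": "...", "Dose": "...", "Frequency": "..."}].
Note: "{{input}}" """

extractor = BasicFrameExtractor(
    inference_engine=OllamaInferenceEngine(model_name="llama3.1:8b"),   # constructor args assumed
    prompt_template=prompt_template,
    system_prompt="You are a clinical information extraction assistant.",
)
# extract_frames() post-processes the LLM JSON output into LLMInformationExtractionFrame objects
frames = extractor.extract_frames(text_content=note_text, entity_key="entity_text")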
 
-        [old lines 1319-1321 removed; content not recoverable in this view]
+class BasicReviewFrameExtractor(ReviewFrameExtractor):
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, review_mode:str, review_prompt:str=None, system_prompt:str=None, **kwrs):
+        """
+        This class add a review step after the BasicFrameExtractor.
+        The Review process asks LLM to review its output and:
+            1. add more frames while keep current. This is efficient for boosting recall.
+            2. or, regenerate frames (add new and delete existing).
+        Use the review_mode parameter to specify. Note that the review_prompt should instruct LLM accordingly.
+
+        Parameters:
+        ----------
+        inference_engine : InferenceEngine
+            the LLM inferencing engine object. Must implements the chat() method.
+        prompt_template : str
+            prompt template with "{{<placeholder name>}}" placeholder.
+        review_prompt : str: Optional
+            the prompt text that ask LLM to review. Specify addition or revision in the instruction.
+            if not provided, a default review prompt will be used.
+        review_mode : str
+            review mode. Must be one of {"addition", "revision"}
+            addition mode only ask LLM to add new frames, while revision mode ask LLM to regenerate.
+        system_prompt : str, Optional
+            system prompt.
+        """
+        super().__init__(inference_engine=inference_engine,
+                         unit_chunker=WholeDocumentUnitChunker(),
+                         prompt_template=prompt_template,
+                         review_mode=review_mode,
+                         review_prompt=review_prompt,
+                         system_prompt=system_prompt,
+                         context_chunker=NoContextChunker(),
+                         **kwrs)
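The review extractors run a second pass over each unit's output: in "addition" mode the review text is appended to the initial output (a recall boost), while "revision" mode replaces it. A construction sketch, reusing the assumed engine and prompt from the previous example:

review_extractor = BasicReviewFrameExtractor(
    inference_engine=OllamaInferenceEngine(model_name="llama3.1:8b"),   # assumed engine
    prompt_template=prompt_template,
    review_mode="addition",     # must be "addition" or "revision"
    review_prompt=None,         # None: a packaged default review prompt is loaded (per the pre-1.0 code shown earlier) and a UserWarning is emitted
    system_prompt="You are a clinical information extraction assistant.",
)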
 
 
-class
-
-    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None,
+class SentenceFrameExtractor(DirectFrameExtractor):
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None,
                  context_sentences:Union[str, int]="all", **kwrs):
         """
-        This class performs sentence-
-
+        This class performs sentence-by-sentence information extraction.
+        The process is as follows:
             1. system prompt (optional)
-            2. user instructions (schema, background, full text, few-shot example...)
-            3.
-            4.
-            5.
-            6. repeat #3, #4, #5
+            2. user prompt with instructions (schema, background, full text, few-shot example...)
+            3. feed a sentence (start with first sentence)
+            4. LLM extract entities and attributes from the sentence
+            5. iterate to the next sentence and repeat steps 3-4 until all sentences are processed.
 
         Input system prompt (optional), prompt template (with user instructions),
         and specify a LLM.
 
-        Parameters
+        Parameters:
         ----------
         inference_engine : InferenceEngine
             the LLM inferencing engine object. Must implements the chat() method.
@@ -1354,104 +1492,77 @@ class SentenceCoTFrameExtractor(SentenceFrameExtractor):
            if > 0, the number of sentences before and after the given sentence to provide as context.
                This is good for tasks that require context beyond the given sentence.
        """
-
-
+        if not isinstance(context_sentences, int) and context_sentences != "all":
+            raise ValueError('context_sentences must be an integer (>= 0) or "all".')
+
+        if isinstance(context_sentences, int) and context_sentences < 0:
+            raise ValueError("context_sentences must be a positive integer.")
+
+        if isinstance(context_sentences, int):
+            context_chunker = SlideWindowContextChunker(window_size=context_sentences)
+        elif context_sentences == "all":
+            context_chunker = WholeDocumentContextChunker()
+
+        super().__init__(inference_engine=inference_engine,
+                         unit_chunker=SentenceUnitChunker(),
+                         prompt_template=prompt_template,
+                         system_prompt=system_prompt,
+                         context_chunker=context_chunker,
+                         **kwrs)
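context_sentences now simply selects a context chunker: an integer becomes a SlideWindowContextChunker with that window size, and "all" becomes a WholeDocumentContextChunker. A quick sketch of the three typical settings (engine and prompt_template assumed as in the earlier sketches):

# No extra context: each sentence is extracted in isolation
isolated = SentenceFrameExtractor(engine, prompt_template, context_sentences=0)

# Two sentences of context on each side of the sentence of interest
windowed = SentenceFrameExtractor(engine, prompt_template, context_sentences=2)

# The full document is provided as context for every sentence
full_doc = SentenceFrameExtractor(engine, prompt_template, context_sentences="all")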
 
 
-
-
+class SentenceReviewFrameExtractor(ReviewFrameExtractor):
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
+                 review_mode:str, review_prompt:str=None, system_prompt:str=None,
+                 context_sentences:Union[str, int]="all", **kwrs):
         """
-        This
+        This class adds a review step after the SentenceFrameExtractor.
+        For each sentence, the review process asks LLM to review its output and:
+            1. add more frames while keeping current. This is efficient for boosting recall.
+            2. or, regenerate frames (add new and delete existing).
+        Use the review_mode parameter to specify. Note that the review_prompt should instruct LLM accordingly.
 
         Parameters:
         ----------
-        [old lines 1368-1385: parameter docstring removed; only fragments ("the", "if") are recoverable in this view]
+        inference_engine : InferenceEngine
+            the LLM inferencing engine object. Must implements the chat() method.
+        prompt_template : str
+            prompt template with "{{<placeholder name>}}" placeholder.
+        review_prompt : str: Optional
+            the prompt text that ask LLM to review. Specify addition or revision in the instruction.
+            if not provided, a default review prompt will be used.
+        review_mode : str
+            review mode. Must be one of {"addition", "revision"}
+            addition mode only ask LLM to add new frames, while revision mode ask LLM to regenerate.
+        system_prompt : str, Optional
+            system prompt.
+        context_sentences : Union[str, int], Optional
+            number of sentences before and after the given sentence to provide additional context.
+            if "all", the full text will be provided in the prompt as context.
+            if 0, no additional context will be provided.
+                This is good for tasks that does not require context beyond the given sentence.
+            if > 0, the number of sentences before and after the given sentence to provide as context.
+                This is good for tasks that require context beyond the given sentence.
        """
-
-
-        # sentence tokenization
-        if isinstance(text_content, str):
-            sentences = self._get_sentences(text_content)
-        elif isinstance(text_content, dict):
-            sentences = self._get_sentences(text_content[document_key])
-
-        if return_messages_log:
-            messages_log = []
-
-        # generate sentence by sentence
-        for i, sent in enumerate(sentences):
-            # construct chat messages
-            messages = []
-            if self.system_prompt:
-                messages.append({'role': 'system', 'content': self.system_prompt})
-
-            context = self._get_context_sentences(text_content, i, sentences, document_key)
-
-            if self.context_sentences == 0:
-                # no context, just place sentence of interest
-                if isinstance(text_content, str):
-                    messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
-                else:
-                    sentence_content = text_content.copy()
-                    sentence_content[document_key] = sent['sentence_text']
-                    messages.append({'role': 'user', 'content': self._get_user_prompt(sentence_content)})
-            else:
-                # insert context
-                if isinstance(text_content, str):
-                    messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
-                else:
-                    context_content = text_content.copy()
-                    context_content[document_key] = context
-                    messages.append({'role': 'user', 'content': self._get_user_prompt(context_content)})
-            # simulate conversation
-            messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
-            # place sentence of interest
-            messages.append({'role': 'user', 'content': sent['sentence_text']})
-
-            if stream:
-                print(f"\n\n{Fore.GREEN}Sentence: {Style.RESET_ALL}\n{sent['sentence_text']}\n")
-                if isinstance(self.context_sentences, int) and self.context_sentences > 0:
-                    print(f"{Fore.YELLOW}Context:{Style.RESET_ALL}\n{context}\n")
-                print(f"{Fore.BLUE}CoT:{Style.RESET_ALL}")
-
-            gen_text = self.inference_engine.chat(
-                messages=messages,
-                max_new_tokens=max_new_tokens,
-                temperature=temperature,
-                stream=stream,
-                **kwrs
-            )
-
-            if return_messages_log:
-                messages.append({"role": "assistant", "content": gen_text})
-                messages_log.append(messages)
-
-            # add to output
-            output.append({'sentence_start': sent['start'],
-                           'sentence_end': sent['end'],
-                           'sentence_text': sent['sentence_text'],
-                           'gen_text': gen_text})
+        if not isinstance(context_sentences, int) and context_sentences != "all":
+            raise ValueError('context_sentences must be an integer (>= 0) or "all".')
 
-        if
-
-
+        if isinstance(context_sentences, int) and context_sentences < 0:
+            raise ValueError("context_sentences must be a positive integer.")
+
+        if isinstance(context_sentences, int):
+            context_chunker = SlideWindowContextChunker(window_size=context_sentences)
+        elif context_sentences == "all":
+            context_chunker = WholeDocumentContextChunker()
+
+        super().__init__(inference_engine=inference_engine,
+                         unit_chunker=SentenceUnitChunker(),
+                         prompt_template=prompt_template,
+                         review_mode=review_mode,
+                         review_prompt=review_prompt,
+                         system_prompt=system_prompt,
+                         context_chunker=context_chunker,
+                         **kwrs)
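Putting the sentence-level review extractor together, with the same hedges as the earlier sketches (engine, prompt template, and the extract_frames() call are assumed rather than shown in this hunk):

extractor = SentenceReviewFrameExtractor(
    inference_engine=OllamaInferenceEngine(model_name="llama3.1:8b"),   # assumed engine
    prompt_template=prompt_template,
    review_mode="revision",     # regenerate each sentence's frames in the review pass
    context_sentences=2,        # 2 sentences of context on each side
)
frames = extractor.extract_frames(text_content=note_text, entity_key="entity_text")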
 
 
 class RelationExtractor(Extractor):