llm-ie 0.4.4__tar.gz → 0.4.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llm_ie-0.4.4 → llm_ie-0.4.6}/PKG-INFO +39 -5
- {llm_ie-0.4.4 → llm_ie-0.4.6}/README.md +38 -4
- {llm_ie-0.4.4 → llm_ie-0.4.6}/pyproject.toml +3 -2
- {llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/asset/prompt_guide/SentenceCoTFrameExtractor_prompt_guide.txt +4 -4
- {llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt +4 -4
- {llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt +4 -4
- {llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/engines.py +119 -34
- {llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/extractors.py +238 -169
- {llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/__init__.py +0 -0
- {llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/asset/PromptEditor_prompts/chat.txt +0 -0
- {llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/asset/PromptEditor_prompts/comment.txt +0 -0
- {llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/asset/PromptEditor_prompts/rewrite.txt +0 -0
- {llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/asset/PromptEditor_prompts/system.txt +0 -0
- {llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt +0 -0
- {llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_revision_review_prompt.txt +0 -0
- {llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_addition_review_prompt.txt +0 -0
- {llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_revision_review_prompt.txt +0 -0
- {llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt +0 -0
- {llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/asset/prompt_guide/BinaryRelationExtractor_prompt_guide.txt +0 -0
- {llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/asset/prompt_guide/MultiClassRelationExtractor_prompt_guide.txt +0 -0
- {llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt +0 -0
- {llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/data_types.py +0 -0
- {llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/prompt_editor.py +0 -0
{llm_ie-0.4.4 → llm_ie-0.4.6}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llm-ie
-Version: 0.4.4
+Version: 0.4.6
 Summary: An LLM-powered tool that transforms everyday language into robust information extraction pipelines.
 License: MIT
 Author: Enshuo (David) Hsu

@@ -41,6 +41,10 @@ An LLM-powered tool that transforms everyday language into robust information ex
 - Support for LiteLLM.
 - [v0.4.1](https://github.com/daviden1013/llm-ie/releases/tag/v0.4.1) (Jan 25, 2025): Added filters, table view, and some new features to visualization tool (make sure to update [ie-viz](https://github.com/daviden1013/ie-viz)).
 - [v0.4.3](https://github.com/daviden1013/llm-ie/releases/tag/v0.4.3) (Feb 7, 2025): Added Azure OpenAI support.
+- [v0.4.5](https://github.com/daviden1013/llm-ie/releases/tag/v0.4.5) (Feb 16, 2025):
+  - Added option to adjust number of context sentences in sentence-based extractors.
+  - Added support for OpenAI reasoning models ("o" series).
+- [v0.4.6](https://github.com/daviden1013/llm-ie/releases/tag/v0.4.6) (Mar 1, 2025): Allow LLM to output overlapping frames.
 
 ## Table of Contents
 - [Overview](#overview)

@@ -340,6 +344,14 @@ from llm_ie.engines import OpenAIInferenceEngine
 inference_engine = OpenAIInferenceEngine(model="gpt-4o-mini")
 ```
 
+For reasoning models ("o" series), use the `reasoning_model=True` flag. The `max_completion_tokens` will be used instead of the `max_tokens`. `temperature` will be ignored.
+
+```python
+from llm_ie.engines import OpenAIInferenceEngine
+
+inference_engine = OpenAIInferenceEngine(model="o1-mini", reasoning_model=True)
+```
+
 #### <img src=doc_asset/readme_img/Azure_icon.png width=32 /> Azure OpenAI API
 In bash, save the endpoint name and API key to environmental variables `AZURE_OPENAI_ENDPOINT` and `AZURE_OPENAI_API_KEY`.
 ```

@@ -356,6 +368,14 @@ from llm_ie.engines import AzureOpenAIInferenceEngine
 inference_engine = AzureOpenAIInferenceEngine(model="gpt-4o-mini")
 ```
 
+For reasoning models ("o" series), use the `reasoning_model=True` flag. The `max_completion_tokens` will be used instead of the `max_tokens`. `temperature` will be ignored.
+
+```python
+from llm_ie.engines import AzureOpenAIInferenceEngine
+
+inference_engine = AzureOpenAIInferenceEngine(model="o1-mini", reasoning_model=True)
+```
+
 #### 🤗 huggingface_hub
 The ```model``` can be a model id hosted on the Hugging Face Hub or a URL to a deployed Inference Endpoint. Refer to the [Inference Client](https://huggingface.co/docs/huggingface_hub/en/package_reference/inference_client) documentation for more details.

@@ -783,7 +803,7 @@ frames = extractor.extract_frames(text_content=text, entity_key="Diagnosis", str
 
 The ```SentenceFrameExtractor``` instructs the LLM to extract sentence by sentence. The reason is to ensure the accuracy of frame spans. It also prevents LLMs from overseeing sections/ sentences. Empirically, this extractor results in better recall than the ```BasicFrameExtractor``` in complex tasks.
 
-For concurrent extraction (recommended), the `async/
+For concurrent extraction (recommended), the `async/await` feature is used to speed up inferencing. The `concurrent_batch_size` sets the batch size of sentences to be processed in cocurrent.
 
 ```python
 from llm_ie.extractors import SentenceFrameExtractor

@@ -792,15 +812,29 @@ extractor = SentenceFrameExtractor(inference_engine, prompt_temp)
 frames = extractor.extract_frames(text_content=text, entity_key="Diagnosis", case_sensitive=False, fuzzy_match=True, concurrent=True, concurrent_batch_size=32)
 ```
 
-The
+The `context_sentences` sets number of sentences before and after the sentence of interest to provide additional context. When `context_sentences=2`, 2 sentences before and 2 sentences after are included in the user prompt as context. When `context_sentences="all"`, the entire document is included as context. When `context_sentences=0`, no context is provided and LLM will only extract based on the current sentence of interest.
 
 ```python
 from llm_ie.extractors import SentenceFrameExtractor
 
-extractor = SentenceFrameExtractor(inference_engine,
-
+extractor = SentenceFrameExtractor(inference_engine=inference_engine,
+                                   prompt_template=prompt_temp,
+                                   context_sentences=2)
+frames = extractor.extract_frames(text_content=text, entity_key="Diagnosis", case_sensitive=False, fuzzy_match=True, stream=True)
 ```
 
+For the sentence:
+
+*The patient has a history of hypertension, hyperlipidemia, and Type 2 diabetes mellitus.*
+
+The context is "previous sentence 2" "previous sentence 1" "the sentence of interest" "proceeding sentence 1" "proceeding sentence 2":
+
+*Emily Brown, MD (Cardiology), Dr. Michael Green, MD (Pulmonology)
+
+*#### Reason for Admission*
+*John Doe, a 49-year-old male, was admitted to the hospital with complaints of chest pain, shortness of breath, and dizziness. The patient has a history of hypertension, hyperlipidemia, and Type 2 diabetes mellitus. #### History of Present Illness*
+*The patient reported that the chest pain started two days prior to admission. The pain was described as a pressure-like sensation in the central chest, radiating to the left arm and jaw.*
+
 </details>
 
 <details>
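
The `context_sentences` behavior described in the hunk above can be pictured as a simple window over the sentence list. The sketch below is illustrative only and is not part of the package diff; the `sentences` list, index `i`, and `context_sentences` value are assumptions, but the slicing mirrors the `_get_context_sentences` logic added in extractors.py further down.

```python
# Illustrative sketch (not package code): assembling a context window of
# context_sentences=2 around the sentence of interest at index i.
sentences = [
    "Emily Brown, MD (Cardiology), Dr. Michael Green, MD (Pulmonology)",
    "#### Reason for Admission",
    "John Doe, a 49-year-old male, was admitted to the hospital with complaints of chest pain, shortness of breath, and dizziness.",
    "The patient has a history of hypertension, hyperlipidemia, and Type 2 diabetes mellitus.",
    "#### History of Present Illness",
    "The patient reported that the chest pain started two days prior to admission.",
]

context_sentences = 2
i = 3  # sentence of interest

start = max(0, i - context_sentences)
end = min(i + 1 + context_sentences, len(sentences))
context = " ".join(sentences[start:end])
print(context)  # 2 sentences before + sentence of interest + 2 sentences after
```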
{llm_ie-0.4.4 → llm_ie-0.4.6}/README.md

@@ -24,6 +24,10 @@ An LLM-powered tool that transforms everyday language into robust information ex
 - Support for LiteLLM.
 - [v0.4.1](https://github.com/daviden1013/llm-ie/releases/tag/v0.4.1) (Jan 25, 2025): Added filters, table view, and some new features to visualization tool (make sure to update [ie-viz](https://github.com/daviden1013/ie-viz)).
 - [v0.4.3](https://github.com/daviden1013/llm-ie/releases/tag/v0.4.3) (Feb 7, 2025): Added Azure OpenAI support.
+- [v0.4.5](https://github.com/daviden1013/llm-ie/releases/tag/v0.4.5) (Feb 16, 2025):
+  - Added option to adjust number of context sentences in sentence-based extractors.
+  - Added support for OpenAI reasoning models ("o" series).
+- [v0.4.6](https://github.com/daviden1013/llm-ie/releases/tag/v0.4.6) (Mar 1, 2025): Allow LLM to output overlapping frames.
 
 ## Table of Contents
 - [Overview](#overview)

@@ -323,6 +327,14 @@ from llm_ie.engines import OpenAIInferenceEngine
 inference_engine = OpenAIInferenceEngine(model="gpt-4o-mini")
 ```
 
+For reasoning models ("o" series), use the `reasoning_model=True` flag. The `max_completion_tokens` will be used instead of the `max_tokens`. `temperature` will be ignored.
+
+```python
+from llm_ie.engines import OpenAIInferenceEngine
+
+inference_engine = OpenAIInferenceEngine(model="o1-mini", reasoning_model=True)
+```
+
 #### <img src=doc_asset/readme_img/Azure_icon.png width=32 /> Azure OpenAI API
 In bash, save the endpoint name and API key to environmental variables `AZURE_OPENAI_ENDPOINT` and `AZURE_OPENAI_API_KEY`.
 ```

@@ -339,6 +351,14 @@ from llm_ie.engines import AzureOpenAIInferenceEngine
 inference_engine = AzureOpenAIInferenceEngine(model="gpt-4o-mini")
 ```
 
+For reasoning models ("o" series), use the `reasoning_model=True` flag. The `max_completion_tokens` will be used instead of the `max_tokens`. `temperature` will be ignored.
+
+```python
+from llm_ie.engines import AzureOpenAIInferenceEngine
+
+inference_engine = AzureOpenAIInferenceEngine(model="o1-mini", reasoning_model=True)
+```
+
 #### 🤗 huggingface_hub
 The ```model``` can be a model id hosted on the Hugging Face Hub or a URL to a deployed Inference Endpoint. Refer to the [Inference Client](https://huggingface.co/docs/huggingface_hub/en/package_reference/inference_client) documentation for more details.

@@ -766,7 +786,7 @@ frames = extractor.extract_frames(text_content=text, entity_key="Diagnosis", str
 
 The ```SentenceFrameExtractor``` instructs the LLM to extract sentence by sentence. The reason is to ensure the accuracy of frame spans. It also prevents LLMs from overseeing sections/ sentences. Empirically, this extractor results in better recall than the ```BasicFrameExtractor``` in complex tasks.
 
-For concurrent extraction (recommended), the `async/
+For concurrent extraction (recommended), the `async/await` feature is used to speed up inferencing. The `concurrent_batch_size` sets the batch size of sentences to be processed in cocurrent.
 
 ```python
 from llm_ie.extractors import SentenceFrameExtractor

@@ -775,15 +795,29 @@ extractor = SentenceFrameExtractor(inference_engine, prompt_temp)
 frames = extractor.extract_frames(text_content=text, entity_key="Diagnosis", case_sensitive=False, fuzzy_match=True, concurrent=True, concurrent_batch_size=32)
 ```
 
-The
+The `context_sentences` sets number of sentences before and after the sentence of interest to provide additional context. When `context_sentences=2`, 2 sentences before and 2 sentences after are included in the user prompt as context. When `context_sentences="all"`, the entire document is included as context. When `context_sentences=0`, no context is provided and LLM will only extract based on the current sentence of interest.
 
 ```python
 from llm_ie.extractors import SentenceFrameExtractor
 
-extractor = SentenceFrameExtractor(inference_engine,
-
+extractor = SentenceFrameExtractor(inference_engine=inference_engine,
+                                   prompt_template=prompt_temp,
+                                   context_sentences=2)
+frames = extractor.extract_frames(text_content=text, entity_key="Diagnosis", case_sensitive=False, fuzzy_match=True, stream=True)
 ```
 
+For the sentence:
+
+*The patient has a history of hypertension, hyperlipidemia, and Type 2 diabetes mellitus.*
+
+The context is "previous sentence 2" "previous sentence 1" "the sentence of interest" "proceeding sentence 1" "proceeding sentence 2":
+
+*Emily Brown, MD (Cardiology), Dr. Michael Green, MD (Pulmonology)
+
+*#### Reason for Admission*
+*John Doe, a 49-year-old male, was admitted to the hospital with complaints of chest pain, shortness of breath, and dizziness. The patient has a history of hypertension, hyperlipidemia, and Type 2 diabetes mellitus. #### History of Present Illness*
+*The patient reported that the chest pain started two days prior to admission. The pain was described as a pressure-like sensation in the central chest, radiating to the left arm and jaw.*
+
 </details>
 
 <details>
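
v0.4.6's "overlapping frames" change surfaces in the `allow_overlap_entities` parameter added to `extract_frames` (see the extractors.py diff below). The README hunks above do not show a usage example, so here is a minimal hedged sketch; it assumes `inference_engine`, `prompt_temp`, and `text` are already set up as in the README examples, and is not quoted from the package.

```python
from llm_ie.extractors import SentenceFrameExtractor

# Sketch only. With allow_overlap_entities=True, a repeated entity text is no
# longer masked out after its first match, so overlapping frames can be
# returned (possibly multiple frames on the same span for identical text).
extractor = SentenceFrameExtractor(inference_engine=inference_engine,
                                   prompt_template=prompt_temp)
frames = extractor.extract_frames(text_content=text, entity_key="Diagnosis",
                                  case_sensitive=False, fuzzy_match=True,
                                  allow_overlap_entities=True)
```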
{llm_ie-0.4.4 → llm_ie-0.4.6}/pyproject.toml

@@ -1,13 +1,14 @@
 [tool.poetry]
 name = "llm-ie"
-version = "0.4.4"
+version = "0.4.6"
 description = "An LLM-powered tool that transforms everyday language into robust information extraction pipelines."
 authors = ["Enshuo (David) Hsu"]
 license = "MIT"
 readme = "README.md"
 
 exclude = [
-    "test/**"
+    "test/**",
+    "develop/**"
 ]
 
 
{llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/asset/prompt_guide/SentenceCoTFrameExtractor_prompt_guide.txt

@@ -61,8 +61,8 @@ Example 1 (single entity type with attributes):
 If there is no specific arm, just omit the "Arm" key. If the percentage is not reported, just omit the "Percentage" key. The "Evidence" should always be provided.
 
 # Input placeholder
-Below is the Adverse reactions section
-{{input}}
+Below is the Adverse reactions section:
+"{{input}}"
 
 
 Example 2 (multiple entity types):

@@ -121,7 +121,7 @@ Example 2 (multiple entity types):
 </Outputs>
 
 # Input placeholder
-Below is the medical note
+Below is the medical note:
 "{{input}}"
 
 

@@ -213,5 +213,5 @@ Example 3 (multiple entity types with corresponding attributes):
 </Outputs>
 
 # Input placeholder
-Below is the
+Below is the medical note:
 "{{input}}"
{llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt
RENAMED

@@ -46,8 +46,8 @@ Example 1 (single entity type with attributes):
 If there is no specific arm, just omit the "Arm" key. If the percentage is not reported, just omit the "Percentage" key. The "Evidence" should always be provided.
 
 # Input placeholder
-Below is the Adverse reactions section
-{{input}}
+Below is the Adverse reactions section:
+"{{input}}"
 
 
 Example 2 (multiple entity types):

@@ -81,7 +81,7 @@ Example 2 (multiple entity types):
 
 
 # Input placeholder
-Below is the medical note
+Below is the medical note:
 "{{input}}"
 
 

@@ -141,5 +141,5 @@ Example 3 (multiple entity types with corresponding attributes):
 
 
 # Input placeholder
-Below is the
+Below is the medical note:
 "{{input}}"
{llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt

@@ -46,8 +46,8 @@ Example 1 (single entity type with attributes):
 If there is no specific arm, just omit the "Arm" key. If the percentage is not reported, just omit the "Percentage" key. The "Evidence" should always be provided.
 
 # Input placeholder
-Below is the Adverse reactions section
-{{input}}
+Below is the Adverse reactions section:
+"{{input}}"
 
 
 Example 2 (multiple entity types):

@@ -81,7 +81,7 @@ Example 2 (multiple entity types):
 
 
 # Input placeholder
-Below is the medical note
+Below is the medical note:
 "{{input}}"
 
 

@@ -141,5 +141,5 @@ Example 3 (multiple entity types with corresponding attributes):
 
 
 # Input placeholder
-Below is the
+Below is the medical note:
 "{{input}}"
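
The prompt-guide changes above only add a colon and wrap the `{{input}}` placeholder in quotes. As a rough illustration of how such a placeholder gets filled at prompt time (this is a simplified sketch, not the package's actual rendering code):

```python
# Illustrative only: naive substitution for the "{{input}}" placeholder
# convention shown in the prompt guides.
prompt_template = 'Below is the medical note:\n"{{input}}"'
note = "John Doe, a 49-year-old male, was admitted with chest pain."
prompt = prompt_template.replace("{{input}}", note)
print(prompt)
```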
{llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/engines.py

@@ -1,4 +1,5 @@
 import abc
+import warnings
 import importlib
 from typing import List, Dict, Union

@@ -242,7 +243,7 @@ class HuggingFaceHubInferenceEngine(InferenceEngine):
 
 
 class OpenAIInferenceEngine(InferenceEngine):
-    def __init__(self, model:str, **kwrs):
+    def __init__(self, model:str, reasoning_model:bool=False, **kwrs):
         """
         The OpenAI API inference engine. Supports OpenAI models and OpenAI compatible servers:
         - vLLM OpenAI compatible server (https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html)

@@ -254,6 +255,8 @@ class OpenAIInferenceEngine(InferenceEngine):
         ----------
         model_name : str
             model name as described in https://platform.openai.com/docs/models
+        reasoning_model : bool, Optional
+            indicator for OpenAI reasoning models ("o" series).
         """
         if importlib.util.find_spec("openai") is None:
             raise ImportError("OpenAI Python API library not found. Please install OpanAI (```pip install openai```).")

@@ -262,6 +265,7 @@ class OpenAIInferenceEngine(InferenceEngine):
         self.client = OpenAI(**kwrs)
         self.async_client = AsyncOpenAI(**kwrs)
         self.model = model
+        self.reasoning_model = reasoning_model
 
     def chat(self, messages:List[Dict[str,str]], max_new_tokens:int=2048, temperature:float=0.0, stream:bool=False, **kwrs) -> str:
         """

@@ -278,14 +282,27 @@ class OpenAIInferenceEngine(InferenceEngine):
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
         """
-        [removed lines not shown in this diff view]
+        if self.reasoning_model:
+            if temperature != 0.0:
+                warnings.warn("Reasoning models do not support temperature parameter. Will be ignored.", UserWarning)
+
+            response = self.client.chat.completions.create(
+                model=self.model,
+                messages=messages,
+                max_completion_tokens=max_new_tokens,
+                stream=stream,
+                **kwrs
+            )
+
+        else:
+            response = self.client.chat.completions.create(
+                model=self.model,
+                messages=messages,
+                max_tokens=max_new_tokens,
+                temperature=temperature,
+                stream=stream,
+                **kwrs
+            )
 
         if stream:
             res = ''

@@ -294,8 +311,17 @@ class OpenAIInferenceEngine(InferenceEngine):
                 if chunk.choices[0].delta.content is not None:
                     res += chunk.choices[0].delta.content
                     print(chunk.choices[0].delta.content, end="", flush=True)
+                if chunk.choices[0].finish_reason == "length":
+                    warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
+            if self.reasoning_model:
+                warnings.warn("max_new_tokens includes reasoning tokens and output tokens.", UserWarning)
             return res
 
+        if response.choices[0].finish_reason == "length":
+            warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
+        if self.reasoning_model:
+            warnings.warn("max_new_tokens includes reasoning tokens and output tokens.", UserWarning)
+
         return response.choices[0].message.content
 

@@ -303,20 +329,37 @@ class OpenAIInferenceEngine(InferenceEngine):
         """
         Async version of chat method. Streaming is not supported.
         """
-        [removed lines not shown in this diff view]
+        if self.reasoning_model:
+            if temperature != 0.0:
+                warnings.warn("Reasoning models do not support temperature parameter. Will be ignored.", UserWarning)
+
+            response = await self.async_client.chat.completions.create(
+                model=self.model,
+                messages=messages,
+                max_completion_tokens=max_new_tokens,
+                stream=False,
+                **kwrs
+            )
+        else:
+            response = await self.async_client.chat.completions.create(
+                model=self.model,
+                messages=messages,
+                max_tokens=max_new_tokens,
+                temperature=temperature,
+                stream=False,
+                **kwrs
+            )
 
+        if response.choices[0].finish_reason == "length":
+            warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
+        if self.reasoning_model:
+            warnings.warn("max_new_tokens includes reasoning tokens and output tokens.", UserWarning)
+
         return response.choices[0].message.content
 
 
 class AzureOpenAIInferenceEngine(InferenceEngine):
-    def __init__(self, model:str, api_version:str, **kwrs):
+    def __init__(self, model:str, api_version:str, reasoning_model:bool=False, **kwrs):
         """
         The Azure OpenAI API inference engine.
         For parameters and documentation, refer to

@@ -329,6 +372,8 @@ class AzureOpenAIInferenceEngine(InferenceEngine):
             model name as described in https://platform.openai.com/docs/models
         api_version : str
             the Azure OpenAI API version
+        reasoning_model : bool, Optional
+            indicator for OpenAI reasoning models ("o" series).
         """
         if importlib.util.find_spec("openai") is None:
             raise ImportError("OpenAI Python API library not found. Please install OpanAI (```pip install openai```).")

@@ -340,6 +385,7 @@ class AzureOpenAIInferenceEngine(InferenceEngine):
                                  **kwrs)
         self.async_client = AsyncAzureOpenAI(api_version=self.api_version,
                                              **kwrs)
+        self.reasoning_model = reasoning_model
 
     def chat(self, messages:List[Dict[str,str]], max_new_tokens:int=2048, temperature:float=0.0, stream:bool=False, **kwrs) -> str:
         """

@@ -356,14 +402,27 @@ class AzureOpenAIInferenceEngine(InferenceEngine):
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
         """
-        [removed lines not shown in this diff view]
+        if self.reasoning_model:
+            if temperature != 0.0:
+                warnings.warn("Reasoning models do not support temperature parameter. Will be ignored.", UserWarning)
+
+            response = self.client.chat.completions.create(
+                model=self.model,
+                messages=messages,
+                max_completion_tokens=max_new_tokens,
+                stream=stream,
+                **kwrs
+            )
+
+        else:
+            response = self.client.chat.completions.create(
+                model=self.model,
+                messages=messages,
+                max_tokens=max_new_tokens,
+                temperature=temperature,
+                stream=stream,
+                **kwrs
+            )
 
         if stream:
             res = ''

@@ -372,8 +431,17 @@ class AzureOpenAIInferenceEngine(InferenceEngine):
                 if chunk.choices[0].delta.content is not None:
                     res += chunk.choices[0].delta.content
                     print(chunk.choices[0].delta.content, end="", flush=True)
+                if chunk.choices[0].finish_reason == "length":
+                    warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
+            if self.reasoning_model:
+                warnings.warn("max_new_tokens includes reasoning tokens and output tokens.", UserWarning)
             return res
 
+        if response.choices[0].finish_reason == "length":
+            warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
+        if self.reasoning_model:
+            warnings.warn("max_new_tokens includes reasoning tokens and output tokens.", UserWarning)
+
         return response.choices[0].message.content
 

@@ -381,15 +449,32 @@ class AzureOpenAIInferenceEngine(InferenceEngine):
         """
         Async version of chat method. Streaming is not supported.
         """
-        [removed lines not shown in this diff view]
+        if self.reasoning_model:
+            if temperature != 0.0:
+                warnings.warn("Reasoning models do not support temperature parameter. Will be ignored.", UserWarning)
+
+            response = await self.async_client.chat.completions.create(
+                model=self.model,
+                messages=messages,
+                max_completion_tokens=max_new_tokens,
+                stream=False,
+                **kwrs
+            )
+        else:
+            response = await self.async_client.chat.completions.create(
+                model=self.model,
+                messages=messages,
+                max_tokens=max_new_tokens,
+                temperature=temperature,
+                stream=False,
+                **kwrs
+            )
 
+        if response.choices[0].finish_reason == "length":
+            warnings.warn("Model stopped generating due to context length limit.", RuntimeWarning)
+        if self.reasoning_model:
+            warnings.warn("max_new_tokens includes reasoning tokens and output tokens.", UserWarning)
+
         return response.choices[0].message.content
 
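
The branches added above switch the request parameter based on the flag: with `reasoning_model=True` the engine sends `max_completion_tokens` and ignores `temperature`, otherwise it sends `max_tokens` and `temperature`. A minimal usage sketch, assuming a valid `OPENAI_API_KEY` is set in the environment (the message content is an illustrative placeholder, not from the package):

```python
from llm_ie.engines import OpenAIInferenceEngine

# Sketch only: exercising the reasoning-model path added in this diff.
engine = OpenAIInferenceEngine(model="o1-mini", reasoning_model=True)
response = engine.chat(
    messages=[{"role": "user", "content": "List the diagnoses in: chest pain, hypertension."}],
    max_new_tokens=4096,  # note: includes reasoning tokens and output tokens
)
print(response)
```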
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import abc
|
|
2
2
|
import re
|
|
3
|
-
import copy
|
|
4
3
|
import json
|
|
5
4
|
import json_repair
|
|
6
5
|
import inspect
|
|
@@ -13,7 +12,6 @@ from typing import Set, List, Dict, Tuple, Union, Callable
|
|
|
13
12
|
from llm_ie.data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
|
|
14
13
|
from llm_ie.engines import InferenceEngine
|
|
15
14
|
from colorama import Fore, Style
|
|
16
|
-
from nltk.tokenize import RegexpTokenizer
|
|
17
15
|
|
|
18
16
|
|
|
19
17
|
class Extractor:
|
|
@@ -139,6 +137,7 @@ class Extractor:
|
|
|
139
137
|
|
|
140
138
|
|
|
141
139
|
class FrameExtractor(Extractor):
|
|
140
|
+
from nltk.tokenize import RegexpTokenizer
|
|
142
141
|
def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None, **kwrs):
|
|
143
142
|
"""
|
|
144
143
|
This is the abstract class for frame extraction.
|
|
@@ -157,7 +156,8 @@ class FrameExtractor(Extractor):
|
|
|
157
156
|
prompt_template=prompt_template,
|
|
158
157
|
system_prompt=system_prompt,
|
|
159
158
|
**kwrs)
|
|
160
|
-
|
|
159
|
+
|
|
160
|
+
self.tokenizer = self.RegexpTokenizer(r'\w+|[^\w\s]')
|
|
161
161
|
|
|
162
162
|
|
|
163
163
|
def _jaccard_score(self, s1:Set[str], s2:Set[str]) -> float:
|
|
@@ -224,7 +224,8 @@ class FrameExtractor(Extractor):
|
|
|
224
224
|
|
|
225
225
|
|
|
226
226
|
def _find_entity_spans(self, text: str, entities: List[str], case_sensitive:bool=False,
|
|
227
|
-
fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8
|
|
227
|
+
fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
|
|
228
|
+
allow_overlap_entities:bool=False) -> List[Tuple[int]]:
|
|
228
229
|
"""
|
|
229
230
|
This function inputs a text and a list of entity text,
|
|
230
231
|
outputs a list of spans (2-tuple) for each entity.
|
|
@@ -245,6 +246,8 @@ class FrameExtractor(Extractor):
|
|
|
245
246
|
fuzzy_score_cutoff : float, Optional
|
|
246
247
|
the Jaccard score cutoff for fuzzy matching.
|
|
247
248
|
Matched entity text must have a score higher than this value or a None will be returned.
|
|
249
|
+
allow_overlap_entities : bool, Optional
|
|
250
|
+
if True, entities can overlap in the text.
|
|
248
251
|
"""
|
|
249
252
|
# Handle case sensitivity
|
|
250
253
|
if not case_sensitive:
|
|
@@ -264,15 +267,17 @@ class FrameExtractor(Extractor):
|
|
|
264
267
|
if match and entity:
|
|
265
268
|
start, end = match.span()
|
|
266
269
|
entity_spans.append((start, end))
|
|
267
|
-
|
|
268
|
-
|
|
270
|
+
if not allow_overlap_entities:
|
|
271
|
+
# Replace the found entity with spaces to avoid finding the same instance again
|
|
272
|
+
text = text[:start] + ' ' * (end - start) + text[end:]
|
|
269
273
|
# Fuzzy match
|
|
270
274
|
elif fuzzy_match:
|
|
271
275
|
closest_substring_span, best_score = self._get_closest_substring(text, entity, buffer_size=fuzzy_buffer_size)
|
|
272
276
|
if closest_substring_span and best_score >= fuzzy_score_cutoff:
|
|
273
277
|
entity_spans.append(closest_substring_span)
|
|
274
|
-
|
|
275
|
-
|
|
278
|
+
if not allow_overlap_entities:
|
|
279
|
+
# Replace the found entity with spaces to avoid finding the same instance again
|
|
280
|
+
text = text[:closest_substring_span[0]] + ' ' * (closest_substring_span[1] - closest_substring_span[0]) + text[closest_substring_span[1]:]
|
|
276
281
|
else:
|
|
277
282
|
entity_spans.append(None)
|
|
278
283
|
|
|
@@ -391,7 +396,7 @@ class BasicFrameExtractor(FrameExtractor):
|
|
|
391
396
|
def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=2048,
|
|
392
397
|
temperature:float=0.0, document_key:str=None, stream:bool=False,
|
|
393
398
|
case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2,
|
|
394
|
-
fuzzy_score_cutoff:float=0.8, **kwrs) -> List[LLMInformationExtractionFrame]:
|
|
399
|
+
fuzzy_score_cutoff:float=0.8, allow_overlap_entities:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
|
|
395
400
|
"""
|
|
396
401
|
This method inputs a text and outputs a list of LLMInformationExtractionFrame
|
|
397
402
|
It use the extract() method and post-process outputs into frames.
|
|
@@ -422,6 +427,9 @@ class BasicFrameExtractor(FrameExtractor):
|
|
|
422
427
|
fuzzy_score_cutoff : float, Optional
|
|
423
428
|
the Jaccard score cutoff for fuzzy matching.
|
|
424
429
|
Matched entity text must have a score higher than this value or a None will be returned.
|
|
430
|
+
allow_overlap_entities : bool, Optional
|
|
431
|
+
if True, entities can overlap in the text.
|
|
432
|
+
Note that this can cause multiple frames to be generated on the same entity span if they have same entity text.
|
|
425
433
|
|
|
426
434
|
Return : str
|
|
427
435
|
a list of frames.
|
|
@@ -452,7 +460,8 @@ class BasicFrameExtractor(FrameExtractor):
|
|
|
452
460
|
case_sensitive=case_sensitive,
|
|
453
461
|
fuzzy_match=fuzzy_match,
|
|
454
462
|
fuzzy_buffer_size=fuzzy_buffer_size,
|
|
455
|
-
fuzzy_score_cutoff=fuzzy_score_cutoff
|
|
463
|
+
fuzzy_score_cutoff=fuzzy_score_cutoff,
|
|
464
|
+
allow_overlap_entities=allow_overlap_entities)
|
|
456
465
|
|
|
457
466
|
for i, (ent, span) in enumerate(zip(entity_json, spans)):
|
|
458
467
|
if span is not None:
|
|
@@ -569,7 +578,8 @@ class ReviewFrameExtractor(BasicFrameExtractor):
|
|
|
569
578
|
|
|
570
579
|
class SentenceFrameExtractor(FrameExtractor):
|
|
571
580
|
from nltk.tokenize.punkt import PunktSentenceTokenizer
|
|
572
|
-
def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None,
|
|
581
|
+
def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None,
|
|
582
|
+
context_sentences:Union[str, int]="all", **kwrs):
|
|
573
583
|
"""
|
|
574
584
|
This class performs sentence-by-sentence information extraction.
|
|
575
585
|
The process is as follows:
|
|
@@ -590,10 +600,26 @@ class SentenceFrameExtractor(FrameExtractor):
|
|
|
590
600
|
prompt template with "{{<placeholder name>}}" placeholder.
|
|
591
601
|
system_prompt : str, Optional
|
|
592
602
|
system prompt.
|
|
603
|
+
context_sentences : Union[str, int], Optional
|
|
604
|
+
number of sentences before and after the given sentence to provide additional context.
|
|
605
|
+
if "all", the full text will be provided in the prompt as context.
|
|
606
|
+
if 0, no additional context will be provided.
|
|
607
|
+
This is good for tasks that does not require context beyond the given sentence.
|
|
608
|
+
if > 0, the number of sentences before and after the given sentence to provide as context.
|
|
609
|
+
This is good for tasks that require context beyond the given sentence.
|
|
593
610
|
"""
|
|
594
611
|
super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
|
|
595
612
|
system_prompt=system_prompt, **kwrs)
|
|
596
613
|
|
|
614
|
+
if not isinstance(context_sentences, int) and context_sentences != "all":
|
|
615
|
+
raise ValueError('context_sentences must be an integer (>= 0) or "all".')
|
|
616
|
+
|
|
617
|
+
if isinstance(context_sentences, int) and context_sentences < 0:
|
|
618
|
+
raise ValueError("context_sentences must be a positive integer.")
|
|
619
|
+
|
|
620
|
+
self.context_sentences =context_sentences
|
|
621
|
+
|
|
622
|
+
|
|
597
623
|
def _get_sentences(self, text:str) -> List[Dict[str,str]]:
|
|
598
624
|
"""
|
|
599
625
|
This method sentence tokenize the input text into a list of sentences
|
|
@@ -614,9 +640,24 @@ class SentenceFrameExtractor(FrameExtractor):
|
|
|
614
640
|
"end": end})
|
|
615
641
|
return sentences
|
|
616
642
|
|
|
643
|
+
|
|
644
|
+
def _get_context_sentences(self, text_content, i:int, sentences:List[Dict[str, str]], document_key:str=None) -> str:
|
|
645
|
+
"""
|
|
646
|
+
This function returns the context sentences for the current sentence of interest (i).
|
|
647
|
+
"""
|
|
648
|
+
if self.context_sentences == "all":
|
|
649
|
+
context = text_content if isinstance(text_content, str) else text_content[document_key]
|
|
650
|
+
elif self.context_sentences == 0:
|
|
651
|
+
context = ""
|
|
652
|
+
else:
|
|
653
|
+
start = max(0, i - self.context_sentences)
|
|
654
|
+
end = min(i + 1 + self.context_sentences, len(sentences))
|
|
655
|
+
context = " ".join([s['sentence_text'] for s in sentences[start:end]])
|
|
656
|
+
return context
|
|
657
|
+
|
|
617
658
|
|
|
618
659
|
def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
|
|
619
|
-
document_key:str=None,
|
|
660
|
+
document_key:str=None, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
|
|
620
661
|
"""
|
|
621
662
|
This method inputs a text and outputs a list of outputs per sentence.
|
|
622
663
|
|
|
@@ -631,12 +672,6 @@ class SentenceFrameExtractor(FrameExtractor):
|
|
|
631
672
|
document_key : str, Optional
|
|
632
673
|
specify the key in text_content where document text is.
|
|
633
674
|
If text_content is str, this parameter will be ignored.
|
|
634
|
-
multi_turn : bool, Optional
|
|
635
|
-
multi-turn conversation prompting.
|
|
636
|
-
If True, sentences and LLM outputs will be appended to the input message and carry-over.
|
|
637
|
-
If False, only the current sentence is prompted.
|
|
638
|
-
For LLM inference engines that supports prompt cache (e.g., Llama.Cpp, Ollama), use multi-turn conversation prompting
|
|
639
|
-
can better utilize the KV caching.
|
|
640
675
|
temperature : float, Optional
|
|
641
676
|
the temperature for token sampling.
|
|
642
677
|
stream : bool, Optional
|
|
@@ -654,19 +689,32 @@ class SentenceFrameExtractor(FrameExtractor):
|
|
|
654
689
|
if document_key is None:
|
|
655
690
|
raise ValueError("document_key must be provided when text_content is dict.")
|
|
656
691
|
sentences = self._get_sentences(text_content[document_key])
|
|
657
|
-
# construct chat messages
|
|
658
|
-
messages = []
|
|
659
|
-
if self.system_prompt:
|
|
660
|
-
messages.append({'role': 'system', 'content': self.system_prompt})
|
|
661
|
-
|
|
662
|
-
messages.append({'role': 'user', 'content': self._get_user_prompt(text_content)})
|
|
663
|
-
messages.append({'role': 'assistant', 'content': 'Sure, please start with the first sentence.'})
|
|
664
692
|
|
|
665
693
|
# generate sentence by sentence
|
|
666
|
-
for sent in sentences:
|
|
667
|
-
|
|
694
|
+
for i, sent in enumerate(sentences):
|
|
695
|
+
# construct chat messages
|
|
696
|
+
messages = []
|
|
697
|
+
if self.system_prompt:
|
|
698
|
+
messages.append({'role': 'system', 'content': self.system_prompt})
|
|
699
|
+
|
|
700
|
+
context = self._get_context_sentences(text_content, i, sentences, document_key)
|
|
701
|
+
|
|
702
|
+
if self.context_sentences == 0:
|
|
703
|
+
# no context, just place sentence of interest
|
|
704
|
+
messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
|
|
705
|
+
else:
|
|
706
|
+
# insert context
|
|
707
|
+
messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
|
|
708
|
+
# simulate conversation
|
|
709
|
+
messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
|
|
710
|
+
# place sentence of interest
|
|
711
|
+
messages.append({'role': 'user', 'content': sent['sentence_text']})
|
|
712
|
+
|
|
668
713
|
if stream:
|
|
669
|
-
print(f"\n\n{Fore.GREEN}Sentence
|
|
714
|
+
print(f"\n\n{Fore.GREEN}Sentence {i}:{Style.RESET_ALL}\n{sent['sentence_text']}\n")
|
|
715
|
+
if isinstance(self.context_sentences, int) and self.context_sentences > 0:
|
|
716
|
+
print(f"{Fore.YELLOW}Context:{Style.RESET_ALL}\n{context}\n")
|
|
717
|
+
|
|
670
718
|
print(f"{Fore.BLUE}Extraction:{Style.RESET_ALL}")
|
|
671
719
|
|
|
672
720
|
gen_text = self.inference_engine.chat(
|
|
@@ -676,19 +724,13 @@ class SentenceFrameExtractor(FrameExtractor):
|
|
|
676
724
|
stream=stream,
|
|
677
725
|
**kwrs
|
|
678
726
|
)
|
|
679
|
-
|
|
680
|
-
if multi_turn:
|
|
681
|
-
# update chat messages with LLM outputs
|
|
682
|
-
messages.append({'role': 'assistant', 'content': gen_text})
|
|
683
|
-
else:
|
|
684
|
-
# delete sentence so that message is reset
|
|
685
|
-
del messages[-1]
|
|
686
727
|
|
|
687
728
|
# add to output
|
|
688
729
|
output.append({'sentence_start': sent['start'],
|
|
689
730
|
'sentence_end': sent['end'],
|
|
690
731
|
'sentence_text': sent['sentence_text'],
|
|
691
732
|
'gen_text': gen_text})
|
|
733
|
+
|
|
692
734
|
return output
|
|
693
735
|
|
|
694
736
|
|
|
@@ -726,21 +768,31 @@ class SentenceFrameExtractor(FrameExtractor):
|
|
|
726
768
|
if document_key is None:
|
|
727
769
|
raise ValueError("document_key must be provided when text_content is dict.")
|
|
728
770
|
sentences = self._get_sentences(text_content[document_key])
|
|
729
|
-
# construct chat messages
|
|
730
|
-
base_messages = []
|
|
731
|
-
if self.system_prompt:
|
|
732
|
-
base_messages.append({'role': 'system', 'content': self.system_prompt})
|
|
733
|
-
|
|
734
|
-
base_messages.append({'role': 'user', 'content': self._get_user_prompt(text_content)})
|
|
735
|
-
base_messages.append({'role': 'assistant', 'content': 'Sure, please start with the first sentence.'})
|
|
736
771
|
|
|
737
772
|
# generate sentence by sentence
|
|
738
|
-
tasks = []
|
|
739
773
|
for i in range(0, len(sentences), concurrent_batch_size):
|
|
774
|
+
tasks = []
|
|
740
775
|
batch = sentences[i:i + concurrent_batch_size]
|
|
741
|
-
for sent in batch:
|
|
742
|
-
|
|
743
|
-
messages
|
|
776
|
+
for j, sent in enumerate(batch):
|
|
777
|
+
# construct chat messages
|
|
778
|
+
messages = []
|
|
779
|
+
if self.system_prompt:
|
|
780
|
+
messages.append({'role': 'system', 'content': self.system_prompt})
|
|
781
|
+
|
|
782
|
+
context = self._get_context_sentences(text_content, i + j, sentences, document_key)
|
|
783
|
+
|
|
784
|
+
if self.context_sentences == 0:
|
|
785
|
+
# no context, just place sentence of interest
|
|
786
|
+
messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
|
|
787
|
+
else:
|
|
788
|
+
# insert context
|
|
789
|
+
messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
|
|
790
|
+
# simulate conversation
|
|
791
|
+
messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
|
|
792
|
+
# place sentence of interest
|
|
793
|
+
messages.append({'role': 'user', 'content': sent['sentence_text']})
|
|
794
|
+
|
|
795
|
+
# add to tasks
|
|
744
796
|
task = asyncio.create_task(
|
|
745
797
|
self.inference_engine.chat_async(
|
|
746
798
|
messages=messages,
|
|
@@ -754,20 +806,20 @@ class SentenceFrameExtractor(FrameExtractor):
|
|
|
754
806
|
# Wait until the batch is done, collect results and move on to next batch
|
|
755
807
|
responses = await asyncio.gather(*tasks)
|
|
756
808
|
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
809
|
+
# Collect outputs
|
|
810
|
+
for gen_text, sent in zip(responses, batch):
|
|
811
|
+
output.append({'sentence_start': sent['start'],
|
|
812
|
+
'sentence_end': sent['end'],
|
|
813
|
+
'sentence_text': sent['sentence_text'],
|
|
814
|
+
'gen_text': gen_text})
|
|
763
815
|
return output
|
|
764
816
|
|
|
765
817
|
|
|
766
818
|
def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=512,
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
819
|
+
document_key:str=None, temperature:float=0.0, stream:bool=False,
|
|
820
|
+
concurrent:bool=False, concurrent_batch_size:int=32,
|
|
821
|
+
case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
|
|
822
|
+
allow_overlap_entities:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
|
|
771
823
|
"""
|
|
772
824
|
This method inputs a text and outputs a list of LLMInformationExtractionFrame
|
|
773
825
|
It use the extract() method and post-process outputs into frames.
|
|
@@ -785,12 +837,6 @@ class SentenceFrameExtractor(FrameExtractor):
|
|
|
785
837
|
document_key : str, Optional
|
|
786
838
|
specify the key in text_content where document text is.
|
|
787
839
|
If text_content is str, this parameter will be ignored.
|
|
788
|
-
multi_turn : bool, Optional
|
|
789
|
-
multi-turn conversation prompting.
|
|
790
|
-
If True, sentences and LLM outputs will be appended to the input message and carry-over.
|
|
791
|
-
If False, only the current sentence is prompted.
|
|
792
|
-
For LLM inference engines that supports prompt cache (e.g., Llama.Cpp, Ollama), use multi-turn conversation prompting
|
|
793
|
-
can better utilize the KV caching.
|
|
794
840
|
temperature : float, Optional
|
|
795
841
|
the temperature for token sampling.
|
|
796
842
|
stream : bool, Optional
|
|
@@ -808,6 +854,9 @@ class SentenceFrameExtractor(FrameExtractor):
|
|
|
808
854
|
fuzzy_score_cutoff : float, Optional
|
|
809
855
|
the Jaccard score cutoff for fuzzy matching.
|
|
810
856
|
Matched entity text must have a score higher than this value or a None will be returned.
|
|
857
|
+
allow_overlap_entities : bool, Optional
|
|
858
|
+
if True, entities can overlap in the text.
|
|
859
|
+
Note that this can cause multiple frames to be generated on the same entity span if they have same entity text.
|
|
811
860
|
|
|
812
861
|
Return : str
|
|
813
862
|
a list of frames.
|
|
@@ -815,8 +864,6 @@ class SentenceFrameExtractor(FrameExtractor):
|
|
|
815
864
|
if concurrent:
|
|
816
865
|
if stream:
|
|
817
866
|
warnings.warn("stream=True is not supported in concurrent mode.", RuntimeWarning)
|
|
818
|
-
if multi_turn:
|
|
819
|
-
warnings.warn("multi_turn=True is not supported in concurrent mode.", RuntimeWarning)
|
|
820
867
|
|
|
821
868
|
nest_asyncio.apply() # For Jupyter notebook. Terminal does not need this.
|
|
822
869
|
llm_output_sentences = asyncio.run(self.extract_async(text_content=text_content,
|
|
@@ -830,7 +877,6 @@ class SentenceFrameExtractor(FrameExtractor):
|
|
|
830
877
|
llm_output_sentences = self.extract(text_content=text_content,
|
|
831
878
|
max_new_tokens=max_new_tokens,
|
|
832
879
|
document_key=document_key,
|
|
833
|
-
multi_turn=multi_turn,
|
|
834
880
|
temperature=temperature,
|
|
835
881
|
stream=stream,
|
|
836
882
|
**kwrs)
|
|
@@ -848,7 +894,8 @@ class SentenceFrameExtractor(FrameExtractor):
|
|
|
848
894
|
case_sensitive=case_sensitive,
|
|
849
895
|
fuzzy_match=fuzzy_match,
|
|
850
896
|
fuzzy_buffer_size=fuzzy_buffer_size,
|
|
851
|
-
fuzzy_score_cutoff=fuzzy_score_cutoff
|
|
897
|
+
fuzzy_score_cutoff=fuzzy_score_cutoff,
|
|
898
|
+
allow_overlap_entities=allow_overlap_entities)
|
|
852
899
|
for ent, span in zip(entity_json, spans):
|
|
853
900
|
if span is not None:
|
|
854
901
|
start, end = span
|
|
@@ -866,7 +913,8 @@ class SentenceFrameExtractor(FrameExtractor):
|
|
|
866
913
|
|
|
867
914
|
class SentenceReviewFrameExtractor(SentenceFrameExtractor):
|
|
868
915
|
def __init__(self, inference_engine:InferenceEngine, prompt_template:str,
|
|
869
|
-
review_mode:str, review_prompt:str=None, system_prompt:str=None,
|
|
916
|
+
review_mode:str, review_prompt:str=None, system_prompt:str=None,
|
|
917
|
+
context_sentences:Union[str, int]="all", **kwrs):
|
|
870
918
|
"""
|
|
871
919
|
This class adds a review step after the SentenceFrameExtractor.
|
|
872
920
|
For each sentence, the review process asks LLM to review its output and:
|
|
@@ -888,9 +936,16 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
|
|
|
888
936
|
addition mode only ask LLM to add new frames, while revision mode ask LLM to regenerate.
|
|
889
937
|
system_prompt : str, Optional
|
|
890
938
|
system prompt.
|
|
939
|
+
context_sentences : Union[str, int], Optional
|
|
940
|
+
number of sentences before and after the given sentence to provide additional context.
|
|
941
|
+
if "all", the full text will be provided in the prompt as context.
|
|
942
|
+
if 0, no additional context will be provided.
|
|
943
|
+
This is good for tasks that does not require context beyond the given sentence.
|
|
944
|
+
if > 0, the number of sentences before and after the given sentence to provide as context.
|
|
945
|
+
This is good for tasks that require context beyond the given sentence.
|
|
891
946
|
"""
|
|
892
947
|
super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
|
|
893
|
-
system_prompt=system_prompt, **kwrs)
|
|
948
|
+
system_prompt=system_prompt, context_sentences=context_sentences, **kwrs)
|
|
894
949
|
|
|
895
950
|
if review_mode not in {"addition", "revision"}:
|
|
896
951
|
raise ValueError('review_mode must be one of {"addition", "revision"}.')
|
|
@@ -908,7 +963,7 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
|
|
|
908
963
|
|
|
909
964
|
|
|
910
965
|
def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
|
|
911
|
-
document_key:str=None,
|
|
966
|
+
document_key:str=None, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
|
|
912
967
|
"""
|
|
913
968
|
This method inputs a text and outputs a list of outputs per sentence.
|
|
914
969
|
|
|
@@ -923,12 +978,6 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
|
|
|
923
978
|
document_key : str, Optional
|
|
924
979
|
specify the key in text_content where document text is.
|
|
925
980
|
If text_content is str, this parameter will be ignored.
|
|
926
|
-
multi_turn : bool, Optional
|
|
927
|
-
multi-turn conversation prompting.
|
|
928
|
-
If True, sentences and LLM outputs will be appended to the input message and carry-over.
|
|
929
|
-
If False, only the current sentence is prompted.
|
|
930
|
-
For LLM inference engines that supports prompt cache (e.g., Llama.Cpp, Ollama), use multi-turn conversation prompting
|
|
931
|
-
can better utilize the KV caching.
|
|
932
981
|
temperature : float, Optional
|
|
933
982
|
the temperature for token sampling.
|
|
934
983
|
stream : bool, Optional
|
|
@@ -946,19 +995,31 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
|
|
|
946
995
|
if document_key is None:
|
|
947
996
|
raise ValueError("document_key must be provided when text_content is dict.")
|
|
948
997
|
sentences = self._get_sentences(text_content[document_key])
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
998
|
+
|
|
999
|
+
# generate sentence by sentence
|
|
1000
|
+
for i, sent in enumerate(sentences):
|
|
1001
|
+
# construct chat messages
|
|
1002
|
+
messages = []
|
|
1003
|
+
if self.system_prompt:
|
|
1004
|
+
messages.append({'role': 'system', 'content': self.system_prompt})
|
|
953
1005
|
|
|
954
|
-
|
|
955
|
-
|
|
1006
|
+
context = self._get_context_sentences(text_content, i, sentences, document_key)
|
|
1007
|
+
|
|
1008
|
+
if self.context_sentences == 0:
|
|
1009
|
+
# no context, just place sentence of interest
|
|
1010
|
+
messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
|
|
1011
|
+
else:
|
|
1012
|
+
# insert context
|
|
1013
|
+
messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
|
|
1014
|
+
# simulate conversation
|
|
1015
|
+
messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
|
|
1016
|
+
# place sentence of interest
|
|
1017
|
+
messages.append({'role': 'user', 'content': sent['sentence_text']})
|
|
956
1018
|
|
|
957
|
-
# generate sentence by sentence
|
|
958
|
-
for sent in sentences:
|
|
959
|
-
messages.append({'role': 'user', 'content': sent['sentence_text']})
|
|
960
1019
|
if stream:
|
|
961
|
-
print(f"\n\n{Fore.GREEN}Sentence: {Style.RESET_ALL}\n{sent['sentence_text']}\n")
|
|
1020
|
+
print(f"\n\n{Fore.GREEN}Sentence {i}: {Style.RESET_ALL}\n{sent['sentence_text']}\n")
|
|
1021
|
+
if isinstance(self.context_sentences, int) and self.context_sentences > 0:
|
|
1022
|
+
print(f"{Fore.YELLOW}Context:{Style.RESET_ALL}\n{context}\n")
|
|
962
1023
|
print(f"{Fore.BLUE}Initial Output:{Style.RESET_ALL}")
|
|
963
1024
|
|
|
964
1025
|
initial = self.inference_engine.chat(
|
|
@@ -988,13 +1049,6 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
|
|
|
988
1049
|
gen_text = review
|
|
989
1050
|
elif self.review_mode == "addition":
|
|
990
1051
|
gen_text = initial + '\n' + review
|
|
991
|
-
|
|
992
|
-
if multi_turn:
|
|
993
|
-
# update chat messages with LLM outputs
|
|
994
|
-
messages.append({'role': 'assistant', 'content': review})
|
|
995
|
-
else:
|
|
996
|
-
# delete sentence and review so that message is reset
|
|
997
|
-
del messages[-3:]
|
|
998
1052
|
|
|
999
1053
|
# add to output
|
|
1000
1054
|
output.append({'sentence_start': sent['start'],
|
|
@@ -1040,24 +1094,34 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
|
|
|
1040
1094
|
if document_key is None:
|
|
1041
1095
|
raise ValueError("document_key must be provided when text_content is dict.")
|
|
1042
1096
|
sentences = self._get_sentences(text_content[document_key])
|
|
1043
|
-
# construct chat messages
|
|
1044
|
-
base_messages = []
|
|
1045
|
-
if self.system_prompt:
|
|
1046
|
-
base_messages.append({'role': 'system', 'content': self.system_prompt})
|
|
1047
|
-
|
|
1048
|
-
base_messages.append({'role': 'user', 'content': self._get_user_prompt(text_content)})
|
|
1049
|
-
         base_messages.append({'role': 'assistant', 'content': 'Sure, please start with the first sentence.'})
 
         # generate initial outputs sentence by sentence
-        initials = []
-        tasks = []
-        message_list = []
         for i in range(0, len(sentences), concurrent_batch_size):
+            messages_list = []
+            init_tasks = []
+            review_tasks = []
             batch = sentences[i:i + concurrent_batch_size]
-            for sent in batch:
-
-                messages
-
+            for j, sent in enumerate(batch):
+                # construct chat messages
+                messages = []
+                if self.system_prompt:
+                    messages.append({'role': 'system', 'content': self.system_prompt})
+
+                context = self._get_context_sentences(text_content, i + j, sentences, document_key)
+
+                if self.context_sentences == 0:
+                    # no context, just place sentence of interest
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+                else:
+                    # insert context
+                    messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                    # simulate conversation
+                    messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
+                    # place sentence of interest
+                    messages.append({'role': 'user', 'content': sent['sentence_text']})
+
+                messages_list.append(messages)
+
                 task = asyncio.create_task(
                     self.inference_engine.chat_async(
                         messages=messages,
@@ -1066,24 +1130,21 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
                         **kwrs
                     )
                 )
-
+                init_tasks.append(task)
 
-                [removed lines 1071-1083 of the 0.4.4 code; their content is not preserved in this diff view]
-        for i in range(0, len(initials), concurrent_batch_size):
-            batch = initials[i:i + concurrent_batch_size]
-            for init in batch:
+            # Wait until the batch is done, collect results and move on to next batch
+            init_responses = await asyncio.gather(*init_tasks)
+            # Collect initials
+            initials = []
+            for gen_text, sent, messages in zip(init_responses, batch, messages_list):
+                initials.append({'sentence_start': sent['start'],
+                                 'sentence_end': sent['end'],
+                                 'sentence_text': sent['sentence_text'],
+                                 'gen_text': gen_text,
+                                 'messages': messages})
+
+            # Review
+            for init in initials:
                 messages = init["messages"]
                 initial = init["gen_text"]
                 messages.append({'role': 'assistant', 'content': initial})
@@ -1096,34 +1157,36 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
                         **kwrs
                     )
                 )
-
+                review_tasks.append(task)
 
-                [removed lines 1101-1120 of the 0.4.4 code; their content is not preserved in this diff view]
+            review_responses = await asyncio.gather(*review_tasks)
+
+            # Collect reviews
+            reviews = []
+            for gen_text, sent in zip(review_responses, batch):
+                reviews.append({'sentence_start': sent['start'],
+                                'sentence_end': sent['end'],
+                                'sentence_text': sent['sentence_text'],
+                                'gen_text': gen_text})
+
+            for init, rev in zip(initials, reviews):
+                if self.review_mode == "revision":
+                    gen_text = rev['gen_text']
+                elif self.review_mode == "addition":
+                    gen_text = init['gen_text'] + '\n' + rev['gen_text']
+
+                # add to output
+                output.append({'sentence_start': init['sentence_start'],
+                               'sentence_end': init['sentence_end'],
+                               'sentence_text': init['sentence_text'],
+                               'gen_text': gen_text})
         return output
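Note on the change above: the 0.4.6 extract_async builds one message list per sentence and runs them in concurrent batches, waiting for each batch to finish before starting the next, instead of carrying a single running conversation. Below is a minimal, self-contained sketch of that batching pattern; it is not the package's code, and `chat_async` here is only a stand-in for an async inference call.

```python
import asyncio
from typing import Dict, List

async def chat_async(messages: List[Dict[str, str]]) -> str:
    # Stand-in for an async LLM call (e.g., an OpenAI/LiteLLM async client).
    await asyncio.sleep(0.01)
    return f"frames for: {messages[-1]['content']}"

async def extract_in_batches(sentences: List[str], batch_size: int = 4) -> List[str]:
    outputs: List[str] = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        # One task per sentence; each task gets its own message list.
        tasks = [
            asyncio.create_task(chat_async([{'role': 'user', 'content': s}]))
            for s in batch
        ]
        # Wait until the whole batch is done before starting the next one.
        outputs.extend(await asyncio.gather(*tasks))
    return outputs

# Example: asyncio.run(extract_in_batches(["Sentence one.", "Sentence two.", "Sentence three."]))
```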
 
 
 class SentenceCoTFrameExtractor(SentenceFrameExtractor):
     from nltk.tokenize.punkt import PunktSentenceTokenizer
-    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None,
+    def __init__(self, inference_engine:InferenceEngine, prompt_template:str, system_prompt:str=None,
+                 context_sentences:Union[str, int]="all", **kwrs):
         """
         This class performs sentence-based Chain-of-thoughts (CoT) information extraction.
         A simulated chat follows this process:
@@ -1145,13 +1208,20 @@ class SentenceCoTFrameExtractor(SentenceFrameExtractor):
             prompt template with "{{<placeholder name>}}" placeholder.
         system_prompt : str, Optional
             system prompt.
+        context_sentences : Union[str, int], Optional
+            number of sentences before and after the given sentence to provide additional context.
+            if "all", the full text will be provided in the prompt as context.
+            if 0, no additional context will be provided.
+                This is good for tasks that does not require context beyond the given sentence.
+            if > 0, the number of sentences before and after the given sentence to provide as context.
+                This is good for tasks that require context beyond the given sentence.
         """
         super().__init__(inference_engine=inference_engine, prompt_template=prompt_template,
-                         system_prompt=system_prompt, **kwrs)
+                         system_prompt=system_prompt, context_sentences=context_sentences, **kwrs)
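Note on the change above: context_sentences controls how much surrounding text is packed into the prompt for each sentence. The helper below is an illustrative, self-contained sketch of those semantics, not the package's internal implementation.

```python
from typing import List, Union

def context_window(sentences: List[str], i: int, context_sentences: Union[str, int]) -> str:
    # "all": whole document; 0: the sentence alone; k > 0: k sentences on each side.
    if context_sentences == "all":
        return " ".join(sentences)
    if context_sentences == 0:
        return sentences[i]
    start = max(0, i - context_sentences)
    end = min(len(sentences), i + context_sentences + 1)
    return " ".join(sentences[start:end])

sents = ["S1.", "S2.", "S3.", "S4.", "S5."]
print(context_window(sents, 2, 1))      # S2. S3. S4.
print(context_window(sents, 2, 0))      # S3.
print(context_window(sents, 2, "all"))  # S1. S2. S3. S4. S5.
```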
 
 
     def extract(self, text_content:Union[str, Dict[str,str]], max_new_tokens:int=512,
-                document_key:str=None,
+                document_key:str=None, temperature:float=0.0, stream:bool=False, **kwrs) -> List[Dict[str,str]]:
         """
         This method inputs a text and outputs a list of outputs per sentence.
 
@@ -1166,12 +1236,6 @@ class SentenceCoTFrameExtractor(SentenceFrameExtractor):
         document_key : str, Optional
             specify the key in text_content where document text is.
             If text_content is str, this parameter will be ignored.
-        multi_turn : bool, Optional
-            multi-turn conversation prompting.
-            If True, sentences and LLM outputs will be appended to the input message and carry-over.
-            If False, only the current sentence is prompted.
-            For LLM inference engines that supports prompt cache (e.g., Llama.Cpp, Ollama), use multi-turn conversation prompting
-            can better utilize the KV caching.
         temperature : float, Optional
             the temperature for token sampling.
         stream : bool, Optional
@@ -1187,19 +1251,31 @@ class SentenceCoTFrameExtractor(SentenceFrameExtractor):
             sentences = self._get_sentences(text_content)
         elif isinstance(text_content, dict):
             sentences = self._get_sentences(text_content[document_key])
-        # construct chat messages
-        messages = []
-        if self.system_prompt:
-            messages.append({'role': 'system', 'content': self.system_prompt})
-
-        messages.append({'role': 'user', 'content': self._get_user_prompt(text_content)})
-        messages.append({'role': 'assistant', 'content': 'Sure, please start with the first sentence.'})
 
         # generate sentence by sentence
-        for sent in sentences:
-
+        for i, sent in enumerate(sentences):
+            # construct chat messages
+            messages = []
+            if self.system_prompt:
+                messages.append({'role': 'system', 'content': self.system_prompt})
+
+            context = self._get_context_sentences(text_content, i, sentences, document_key)
+
+            if self.context_sentences == 0:
+                # no context, just place sentence of interest
+                messages.append({'role': 'user', 'content': self._get_user_prompt(sent['sentence_text'])})
+            else:
+                # insert context
+                messages.append({'role': 'user', 'content': self._get_user_prompt(context)})
+                # simulate conversation
+                messages.append({'role': 'assistant', 'content': 'Sure, please provide the sentence of interest.'})
+                # place sentence of interest
+                messages.append({'role': 'user', 'content': sent['sentence_text']})
+
             if stream:
                 print(f"\n\n{Fore.GREEN}Sentence: {Style.RESET_ALL}\n{sent['sentence_text']}\n")
+                if isinstance(self.context_sentences, int) and self.context_sentences > 0:
+                    print(f"{Fore.YELLOW}Context:{Style.RESET_ALL}\n{context}\n")
                 print(f"{Fore.BLUE}CoT:{Style.RESET_ALL}")
 
             gen_text = self.inference_engine.chat(
@@ -1209,13 +1285,6 @@ class SentenceCoTFrameExtractor(SentenceFrameExtractor):
                 stream=stream,
                 **kwrs
             )
-
-            if multi_turn:
-                # update chat messages with LLM outputs
-                messages.append({'role': 'assistant', 'content': gen_text})
-            else:
-                # delete sentence so that message is reset
-                del messages[-1]
 
             # add to output
             output.append({'sentence_start': sent['start'],
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt
RENAMED
|
File without changes
|
{llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/asset/prompt_guide/BinaryRelationExtractor_prompt_guide.txt
RENAMED
|
File without changes
|
|
File without changes
|
{llm_ie-0.4.4 → llm_ie-0.4.6}/src/llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|