llm-ie 1.2.3__tar.gz → 1.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. {llm_ie-1.2.3 → llm_ie-1.2.4}/PKG-INFO +1 -1
  2. {llm_ie-1.2.3 → llm_ie-1.2.4}/pyproject.toml +1 -1
  3. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/__init__.py +4 -4
  4. llm_ie-1.2.4/src/llm_ie/asset/default_prompts/LLMUnitChunker_user_prompt.txt +129 -0
  5. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/chunkers.py +104 -4
  6. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/engines.py +44 -0
  7. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/extractors.py +8 -80
  8. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/prompt_editor.py +9 -32
  9. llm_ie-1.2.4/src/llm_ie/utils.py +95 -0
  10. {llm_ie-1.2.3 → llm_ie-1.2.4}/README.md +0 -0
  11. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/PromptEditor_prompts/chat.txt +0 -0
  12. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/PromptEditor_prompts/comment.txt +0 -0
  13. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/PromptEditor_prompts/rewrite.txt +0 -0
  14. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/PromptEditor_prompts/system.txt +0 -0
  15. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/BasicReviewFrameExtractor_addition_review_prompt.txt +0 -0
  16. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/BasicReviewFrameExtractor_revision_review_prompt.txt +0 -0
  17. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt +0 -0
  18. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_revision_review_prompt.txt +0 -0
  19. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_addition_review_prompt.txt +0 -0
  20. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_revision_review_prompt.txt +0 -0
  21. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/AttributeExtractor_prompt_guide.txt +0 -0
  22. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt +0 -0
  23. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/BasicReviewFrameExtractor_prompt_guide.txt +0 -0
  24. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/BinaryRelationExtractor_prompt_guide.txt +0 -0
  25. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/DirectFrameExtractor_prompt_guide.txt +0 -0
  26. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/MultiClassRelationExtractor_prompt_guide.txt +0 -0
  27. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt +0 -0
  28. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt +0 -0
  29. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt +0 -0
  30. {llm_ie-1.2.3 → llm_ie-1.2.4}/src/llm_ie/data_types.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: llm-ie
3
- Version: 1.2.3
3
+ Version: 1.2.4
4
4
  Summary: A comprehensive toolkit that provides building blocks for LLM-based named entity recognition, attribute extraction, and relation extraction pipelines.
5
5
  License: MIT
6
6
  Author: Enshuo (David) Hsu
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "llm-ie"
3
- version = "1.2.3"
3
+ version = "1.2.4"
4
4
  description = "A comprehensive toolkit that provides building blocks for LLM-based named entity recognition, attribute extraction, and relation extraction pipelines."
5
5
  authors = ["Enshuo (David) Hsu"]
6
6
  license = "MIT"
@@ -1,12 +1,12 @@
1
1
  from .data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
2
2
  from .engines import BasicLLMConfig, ReasoningLLMConfig, Qwen3LLMConfig, OpenAIReasoningLLMConfig
3
- from .engines import LlamaCppInferenceEngine, OllamaInferenceEngine, HuggingFaceHubInferenceEngine, VLLMInferenceEngine, OpenRouterInferenceEngine, OpenAIInferenceEngine, AzureOpenAIInferenceEngine, LiteLLMInferenceEngine
3
+ from .engines import LlamaCppInferenceEngine, OllamaInferenceEngine, HuggingFaceHubInferenceEngine, VLLMInferenceEngine, SGLangInferenceEngine, OpenRouterInferenceEngine, OpenAIInferenceEngine, AzureOpenAIInferenceEngine, LiteLLMInferenceEngine
4
4
  from .extractors import DirectFrameExtractor, ReviewFrameExtractor, BasicFrameExtractor, BasicReviewFrameExtractor, SentenceFrameExtractor, SentenceReviewFrameExtractor, AttributeExtractor, BinaryRelationExtractor, MultiClassRelationExtractor
5
- from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, SeparatorUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
5
+ from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, SeparatorUnitChunker, LLMUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
6
6
  from .prompt_editor import PromptEditor
7
7
 
8
8
  __all__ = ["LLMInformationExtractionFrame", "LLMInformationExtractionDocument",
9
- "BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "VLLMInferenceEngine", "OpenRouterInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
9
+ "BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "VLLMInferenceEngine", "SGLangInferenceEngine", "OpenRouterInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
10
10
  "DirectFrameExtractor", "ReviewFrameExtractor", "BasicFrameExtractor", "BasicReviewFrameExtractor", "SentenceFrameExtractor", "SentenceReviewFrameExtractor", "AttributeExtractor", "BinaryRelationExtractor", "MultiClassRelationExtractor",
11
- "UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "SeparatorUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
11
+ "UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "SeparatorUnitChunker", "LLMUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
12
12
  "PromptEditor"]
@@ -0,0 +1,129 @@
1
+ ### Task description
2
+ You are a helpful assistant that breaks down a text document into semantic units or chunks. Each chunk should represent a coherent section of the text, such as a paragraph, subsection, or topic.
3
+
4
+ ### Schema definition
5
+ You will output a JSON array of objects. Each object should have the following fields:
6
+ - "title": Generate a brief title summarizing the content of the chunk.
7
+ - "anchor_text": the first line of text in the chunk used to locate it in the original document. Must be an exact match.
8
+ - if there is a title or heading for the chunk, use that as the anchor_text.
9
+ - if there is no title or heading, use the first sentence of the chunk as the anchor_text.
10
+
11
+ ```JSON
12
+ [
13
+ {
14
+ "title": "<your title here>",
15
+ "anchor_text": "<the anchor text of the chunk here>"
16
+ },
17
+ {
18
+ "title": "<your title here>",
19
+ "anchor_text": "<the anchor text of the chunk here>"
20
+ }
21
+ ]
22
+ ```
23
+
24
+ ### Examples
25
+
26
+ **Input**:
27
+ "# Clinical Note
28
+
29
+ **Patient Name**: Michael Green
30
+ **Medical Record Number**: 1122334455
31
+ **Date of Visit**: January 5, 2025
32
+ **Provider**: Dr. Emily Carter, MD
33
+
34
+ ## Reason for Visit
35
+ Follow-up for poorly controlled type 2 diabetes and complaints of occasional dizziness and blurred vision.
36
+
37
+ ## Summary of Visit
38
+ Michael Green, a 62-year-old male with a known history of type 2 diabetes, hypertension, and obesity, presents for follow-up regarding his glycemic control. Despite recent adjustments to his treatment plan, his glucose readings have remained elevated, averaging 180-220 mg/dL. He reports occasional episodes of dizziness and blurred vision, particularly in the morning before meals. He denies chest pain, palpitations, or recent falls. He reports compliance with his medication regimen but admits to difficulty following a consistent low-carbohydrate diet.
39
+
40
+ Michael has been using a glucose meter to monitor his blood sugar levels and logs them daily. His last hemoglobin A1c, performed three months ago, was 9.2%. He reports no recent hospitalizations, infections, or significant stressors.
41
+
42
+ ## Notable History
43
+ - **Chronic Conditions**:
44
+ - Type 2 diabetes mellitus, diagnosed 10 years ago.
45
+ - Hypertension, well-controlled on medication.
46
+ - Hyperlipidemia, on statin therapy.
47
+ - **Past Surgical History**:
48
+ - Knee arthroscopy for a meniscal tear, age 50.
49
+ - **Family History**:
50
+ - Mother: Deceased at 75, complications from diabetes.
51
+ - Father: Deceased at 70, myocardial infarction."
52
+
53
+ **Output**:
54
+ ```JSON
55
+ [
56
+ {
57
+ "title": "Patient Information",
58
+ "anchor_text": "# Clinical Note"
59
+ },
60
+ {
61
+ "title": "Reason for Visit",
62
+ "anchor_text": "## Reason for Visit"
63
+ },
64
+ {
65
+ "title": "Summary of Visit",
66
+ "anchor_text": "## Summary of Visit"
67
+ },
68
+ {
69
+ "title": "Notable History",
70
+ "anchor_text": "## Notable History"
71
+ }
72
+ ]
73
+ ```
74
+
75
+ **Input**:
76
+ "In the [**Hospital1 18**] ED, 35.3 102 133/58 100%AC 500x20, 5, 1.0 with an
77
+ ABG 7.16/66/162. He had a CTH which was unremarkable. He then
78
+ had a CTA chest, afterwhich he went into PEA arrest.
79
+ Rescucitation last approximately 10-15 minutes with multiple
80
+ rounds of epi and bicarb, with ROSC. He was then admitted to the
81
+ MICU for further management.
82
+ .
83
+ Currently, the patient is intubated, sedated, and parlyzed.
84
+
85
+ Past Medical History:
86
+ Asthma
87
+ Dilated cardiomyopathy
88
+ Multiple admissions for dyspnea this winter ([**1-26**]).
89
+ Anxiety/depression
90
+ CKD
91
+ HLD
92
+ Obesity
93
+ HTN
94
+
95
+ Social History:
96
+ Unknown
97
+
98
+ Family History:
99
+ Unknown"
100
+
101
+ **Output**:
102
+ ```JSON
103
+ [
104
+ {
105
+ "title": "Patient Presentation and Initial Management",
106
+ "anchor_text": "In the [**Hospital1 18**] ED, 35.3 102 133/58 100%AC 500x20, 5, 1.0 with an"
107
+ },
108
+ {
109
+ "title": "Current Status of the Patient",
110
+ "anchor_text": "Currently, the patient is intubated, sedated, and parlyzed."
111
+ },
112
+ {
113
+ "title": "Past Medical History",
114
+ "anchor_text": "Past Medical History:"
115
+ },
116
+ {
117
+ "title": "Social History",
118
+ "anchor_text": "Social History:"
119
+ },
120
+ {
121
+ "title": "Family History",
122
+ "anchor_text": "Family History:"
123
+ }
124
+ ]
125
+ ```
126
+
127
+ ### Document text
128
+
129
+ "{{document_text}}"
@@ -1,8 +1,11 @@
1
1
  import abc
2
- from typing import Set, List, Dict, Tuple, Union, Callable
2
+ from typing import List
3
3
  import asyncio
4
4
  import uuid
5
+ import importlib.resources
6
+ from llm_ie.utils import extract_json, apply_prompt_template
5
7
  from llm_ie.data_types import FrameExtractionUnit
8
+ from llm_ie.engines import InferenceEngine
6
9
 
7
10
 
8
11
  class UnitChunker(abc.ABC):
@@ -74,13 +77,14 @@ class SeparatorUnitChunker(UnitChunker):
74
77
  text : str
75
78
  The document text.
76
79
  """
80
+ doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
77
81
  paragraphs = text.split(self.sep)
78
82
  paragraph_units = []
79
83
  start = 0
80
84
  for paragraph in paragraphs:
81
85
  end = start + len(paragraph)
82
86
  paragraph_units.append(FrameExtractionUnit(
83
- doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
87
+ doc_id=doc_id,
84
88
  start=start,
85
89
  end=end,
86
90
  text=paragraph
@@ -104,10 +108,11 @@ class SentenceUnitChunker(UnitChunker):
104
108
  text : str
105
109
  The document text.
106
110
  """
111
+ doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
107
112
  sentences = []
108
113
  for start, end in self.PunktSentenceTokenizer().span_tokenize(text):
109
114
  sentences.append(FrameExtractionUnit(
110
- doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
115
+ doc_id=doc_id,
111
116
  start=start,
112
117
  end=end,
113
118
  text=text[start:end]
@@ -129,13 +134,14 @@ class TextLineUnitChunker(UnitChunker):
129
134
  text : str
130
135
  The document text.
131
136
  """
137
+ doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
132
138
  lines = text.split('\n')
133
139
  line_units = []
134
140
  start = 0
135
141
  for line in lines:
136
142
  end = start + len(line)
137
143
  line_units.append(FrameExtractionUnit(
138
- doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
144
+ doc_id=doc_id,
139
145
  start=start,
140
146
  end=end,
141
147
  text=line
@@ -143,6 +149,100 @@ class TextLineUnitChunker(UnitChunker):
143
149
  start = end + 1
144
150
  return line_units
145
151
 
152
+ class LLMUnitChunker(UnitChunker):
153
+ def __init__(self, inference_engine:InferenceEngine, prompt_template:str=None, system_prompt:str=None):
154
+ """
155
+ This class prompts an LLM to segment a document (e.g., into sections, paragraphs).
156
+
157
+ Parameters:
158
+ ----------
159
+ inference_engine : InferenceEngine
160
+ the LLM inferencing engine object.
161
+ prompt_template : str
162
+ the prompt template that defines how to chunk the document. Must define a JSON schema with
163
+ ```json
164
+ [
165
+ {
166
+ "title": "<your title here>",
167
+ "anchor_text": "<the anchor text of the chunk here>"
168
+ },
169
+ {
170
+ "title": "<your title here>",
171
+ "anchor_text": "<the anchor text of the chunk here>"
172
+ }
173
+ ]
174
+ ```
175
+ system_prompt : str, optional
176
+ The system prompt.
177
+ """
178
+ self.inference_engine = inference_engine
179
+
180
+ if prompt_template is None:
181
+ file_path = importlib.resources.files('llm_ie.asset.default_prompts').joinpath("LLMUnitChunker_user_prompt.txt")
182
+ with open(file_path, 'r', encoding="utf-8") as f:
183
+ self.prompt_template = f.read()
184
+ else:
185
+ self.prompt_template = prompt_template
186
+
187
+ self.system_prompt = system_prompt
188
+
189
+ def chunk(self, text, doc_id=None) -> List[FrameExtractionUnit]:
190
+ """
191
+ Parameters:
192
+ -----------
193
+ text : str
194
+ the document text.
195
+ doc_id : str, optional
196
+ the document id.
197
+ """
198
+ doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
199
+ user_prompt = apply_prompt_template(prompt_template=self.prompt_template, text_content=text)
200
+ messages = []
201
+ if self.system_prompt is not None:
202
+ messages.append({'role': 'system', 'content': self.system_prompt})
203
+ messages.append({'role': 'user', 'content': user_prompt})
204
+
205
+ gen_text = self.inference_engine.chat(messages=messages)
206
+
207
+ header_list = extract_json(gen_text=gen_text["response"])
208
+ units = []
209
+ start = 0
210
+ prev_end = 0
211
+ for header in header_list:
212
+ if "anchor_text" not in header:
213
+ warnings.warn(f"Missing anchor_text in header: {header}. Skipping this header.")
214
+ continue
215
+ if not isinstance(header["anchor_text"], str):
216
+ warnings.warn(f"Invalid anchor_text: {header['anchor_text']}. Skipping this header.")
217
+ continue
218
+
219
+ start = prev_end
220
+ # find the first instance of the anchor text in the rest of the text
221
+ end = text.find(header["anchor_text"], start)
222
+ # if not found, skip this header
223
+ if end == -1:
224
+ continue
225
+ # if start == end (empty text), skip this header
226
+ if start == end:
227
+ continue
228
+ # create a frame extraction unit
229
+ units.append(FrameExtractionUnit(
230
+ doc_id=doc_id,
231
+ start=start,
232
+ end=end,
233
+ text=text[start:end]
234
+ ))
235
+ prev_end = end
236
+ # add the last section
237
+ if prev_end < len(text):
238
+ units.append(FrameExtractionUnit(
239
+ doc_id=doc_id,
240
+ start=prev_end,
241
+ end=len(text),
242
+ text=text[prev_end:]
243
+ ))
244
+ return units
245
+
146
246
 
147
247
  class ContextChunker(abc.ABC):
148
248
  def __init__(self):
@@ -1060,6 +1060,50 @@ class VLLMInferenceEngine(OpenAICompatibleInferenceEngine):
1060
1060
  return {"reasoning": getattr(response.choices[0].message, "reasoning_content", ""),
1061
1061
  "response": getattr(response.choices[0].message, "content", "")}
1062
1062
 
1063
+ class SGLangInferenceEngine(OpenAICompatibleInferenceEngine):
1064
+ def __init__(self, model:str, api_key:str="", base_url:str="http://localhost:30000/v1", config:LLMConfig=None, **kwrs):
1065
+ """
1066
+ SGLang OpenAI compatible API inference engine.
1067
+ https://docs.sglang.ai/basic_usage/openai_api.html
1068
+
1069
+ Parameters:
1070
+ ----------
1071
+ model : str
1072
+ model name as shown in the SGLang server
1073
+ api_key : str, Optional
1074
+ the API key for the SGLang server.
1075
+ base_url : str, Optional
1076
+ the base url for the SGLang server.
1077
+ config : LLMConfig
1078
+ the LLM configuration.
1079
+ """
1080
+ super().__init__(model, api_key, base_url, config, **kwrs)
1081
+
1082
+
1083
+ def _format_response(self, response: Any) -> Dict[str, str]:
1084
+ """
1085
+ This method formats the response from the OpenAI-compatible API to a dict with keys "type" and "data".
1086
+
1087
+ Parameters:
1088
+ ----------
1089
+ response : Any
1090
+ the response from OpenAI-compatible API. Could be a dict, generator, or object.
1091
+ """
1092
+ if isinstance(response, self.ChatCompletionChunk):
1093
+ if hasattr(response.choices[0].delta, "reasoning_content") and getattr(response.choices[0].delta, "reasoning_content") is not None:
1094
+ chunk_text = getattr(response.choices[0].delta, "reasoning_content", "")
1095
+ if chunk_text is None:
1096
+ chunk_text = ""
1097
+ return {"type": "reasoning", "data": chunk_text}
1098
+ else:
1099
+ chunk_text = getattr(response.choices[0].delta, "content", "")
1100
+ if chunk_text is None:
1101
+ chunk_text = ""
1102
+ return {"type": "response", "data": chunk_text}
1103
+
1104
+ return {"reasoning": getattr(response.choices[0].message, "reasoning_content", ""),
1105
+ "response": getattr(response.choices[0].message, "content", "")}
1106
+
1063
1107
 
1064
1108
  class OpenRouterInferenceEngine(OpenAICompatibleInferenceEngine):
1065
1109
  def __init__(self, model:str, api_key:str=None, base_url:str="https://openrouter.ai/api/v1", config:LLMConfig=None, **kwrs):
@@ -1,7 +1,5 @@
1
1
  import abc
2
2
  import re
3
- import json
4
- import json_repair
5
3
  import inspect
6
4
  import importlib.resources
7
5
  import warnings
@@ -10,6 +8,7 @@ import asyncio
10
8
  import nest_asyncio
11
9
  from concurrent.futures import ThreadPoolExecutor
12
10
  from typing import Any, Set, List, Dict, Tuple, Union, Callable, Generator, Optional, AsyncGenerator
11
+ from llm_ie.utils import extract_json, apply_prompt_template
13
12
  from llm_ie.data_types import FrameExtractionUnit, LLMInformationExtractionFrame, LLMInformationExtractionDocument
14
13
  from llm_ie.chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker
15
14
  from llm_ie.chunkers import ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
@@ -96,79 +95,8 @@ class Extractor:
96
95
  Returns : str
97
96
  a user prompt.
98
97
  """
99
- pattern = re.compile(r'{{(.*?)}}')
100
- if isinstance(text_content, str):
101
- matches = pattern.findall(self.prompt_template)
102
- if len(matches) != 1:
103
- raise ValueError("When text_content is str, the prompt template must has exactly 1 placeholder {{<placeholder name>}}.")
104
- text = re.sub(r'\\', r'\\\\', text_content)
105
- prompt = pattern.sub(text, self.prompt_template)
98
+ return apply_prompt_template(self.prompt_template, text_content)
106
99
 
107
- elif isinstance(text_content, dict):
108
- # Check if all values are str
109
- if not all([isinstance(v, str) for v in text_content.values()]):
110
- raise ValueError("All values in text_content must be str.")
111
- # Check if all keys are in the prompt template
112
- placeholders = pattern.findall(self.prompt_template)
113
- if len(placeholders) != len(text_content):
114
- raise ValueError(f"Expect text_content ({len(text_content)}) and prompt template placeholder ({len(placeholders)}) to have equal size.")
115
- if not all([k in placeholders for k, _ in text_content.items()]):
116
- raise ValueError(f"All keys in text_content ({text_content.keys()}) must match placeholders in prompt template ({placeholders}).")
117
-
118
- prompt = pattern.sub(lambda match: re.sub(r'\\', r'\\\\', text_content[match.group(1)]), self.prompt_template)
119
-
120
- return prompt
121
-
122
- def _find_dict_strings(self, text: str) -> List[str]:
123
- """
124
- Extracts balanced JSON-like dictionaries from a string, even if nested.
125
-
126
- Parameters:
127
- -----------
128
- text : str
129
- the input text containing JSON-like structures.
130
-
131
- Returns : List[str]
132
- A list of valid JSON-like strings representing dictionaries.
133
- """
134
- open_brace = 0
135
- start = -1
136
- json_objects = []
137
-
138
- for i, char in enumerate(text):
139
- if char == '{':
140
- if open_brace == 0:
141
- # start of a new JSON object
142
- start = i
143
- open_brace += 1
144
- elif char == '}':
145
- open_brace -= 1
146
- if open_brace == 0 and start != -1:
147
- json_objects.append(text[start:i + 1])
148
- start = -1
149
-
150
- return json_objects
151
-
152
-
153
- def _extract_json(self, gen_text:str) -> List[Dict[str, str]]:
154
- """
155
- This method inputs a generated text and output a JSON of information tuples
156
- """
157
- out = []
158
- dict_str_list = self._find_dict_strings(gen_text)
159
- for dict_str in dict_str_list:
160
- try:
161
- dict_obj = json.loads(dict_str)
162
- out.append(dict_obj)
163
- except json.JSONDecodeError:
164
- dict_obj = json_repair.repair_json(dict_str, skip_json_loads=True, return_objects=True)
165
- if dict_obj:
166
- warnings.warn(f'JSONDecodeError detected, fixed with repair_json:\n{dict_str}', RuntimeWarning)
167
- out.append(dict_obj)
168
- else:
169
- warnings.warn(f'JSONDecodeError could not be fixed:\n{dict_str}', RuntimeWarning)
170
- return out
171
-
172
100
 
173
101
  class FrameExtractor(Extractor):
174
102
  from nltk.tokenize import RegexpTokenizer
@@ -759,7 +687,7 @@ class DirectFrameExtractor(FrameExtractor):
759
687
  if unit.status != "success":
760
688
  warnings.warn(f"Skipping failed unit ({unit.start}, {unit.end}): {unit.text}", RuntimeWarning)
761
689
  continue
762
- for entity in self._extract_json(gen_text=unit.gen_text):
690
+ for entity in extract_json(gen_text=unit.gen_text):
763
691
  if ENTITY_KEY in entity:
764
692
  entity_json.append(entity)
765
693
  else:
@@ -963,7 +891,7 @@ class DirectFrameExtractor(FrameExtractor):
963
891
  frame_list = []
964
892
  for res in sorted(doc_results['units'], key=lambda r: r.start):
965
893
  entity_json = []
966
- for entity in self._extract_json(gen_text=res.gen_text):
894
+ for entity in extract_json(gen_text=res.gen_text):
967
895
  if ENTITY_KEY in entity:
968
896
  entity_json.append(entity)
969
897
  else:
@@ -1712,7 +1640,7 @@ class AttributeExtractor(Extractor):
1712
1640
  messages_logger=messages_logger
1713
1641
  )
1714
1642
 
1715
- attribute_list = self._extract_json(gen_text=gen_text["response"])
1643
+ attribute_list = extract_json(gen_text=gen_text["response"])
1716
1644
  if isinstance(attribute_list, list) and len(attribute_list) > 0:
1717
1645
  attributes = attribute_list[0]
1718
1646
  if return_messages_log:
@@ -1822,7 +1750,7 @@ class AttributeExtractor(Extractor):
1822
1750
  messages.append({'role': 'user', 'content': self._get_user_prompt({"context": context, "frame": str(frame.to_dict())})})
1823
1751
 
1824
1752
  gen_text = await self.inference_engine.chat_async(messages=messages, messages_logger=messages_logger)
1825
- attribute_list = self._extract_json(gen_text=gen_text["response"])
1753
+ attribute_list = extract_json(gen_text=gen_text["response"])
1826
1754
  attributes = attribute_list[0] if isinstance(attribute_list, list) and len(attribute_list) > 0 else {}
1827
1755
  return {"frame": frame, "attributes": attributes, "messages": messages}
1828
1756
 
@@ -2075,7 +2003,7 @@ class BinaryRelationExtractor(RelationExtractor):
2075
2003
  return None
2076
2004
 
2077
2005
  def _post_process_result(self, gen_text: str, pair_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
2078
- rel_json = self._extract_json(gen_text)
2006
+ rel_json = extract_json(gen_text)
2079
2007
  if len(rel_json) > 0 and "Relation" in rel_json[0]:
2080
2008
  rel = rel_json[0]["Relation"]
2081
2009
  if (isinstance(rel, bool) and rel) or (isinstance(rel, str) and rel.lower() == 'true'):
@@ -2141,7 +2069,7 @@ class MultiClassRelationExtractor(RelationExtractor):
2141
2069
  return None
2142
2070
 
2143
2071
  def _post_process_result(self, gen_text: str, pair_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
2144
- rel_json = self._extract_json(gen_text)
2072
+ rel_json = extract_json(gen_text)
2145
2073
  pos_rel_types = pair_data['pos_rel_types']
2146
2074
  if len(rel_json) > 0 and "RelationType" in rel_json[0]:
2147
2075
  rel_type = rel_json[0]["RelationType"]
@@ -2,6 +2,7 @@ import sys
2
2
  import warnings
3
3
  from typing import List, Dict, Generator
4
4
  import importlib.resources
5
+ from llm_ie.utils import apply_prompt_template
5
6
  from llm_ie.engines import InferenceEngine
6
7
  from llm_ie.extractors import FrameExtractor
7
8
  import re
@@ -45,30 +46,6 @@ class PromptEditor:
45
46
 
46
47
  # internal memory (history messages) for the `chat` method
47
48
  self.messages = []
48
-
49
- def _apply_prompt_template(self, text_content:Dict[str,str], prompt_template:str) -> str:
50
- """
51
- This method applies text_content to prompt_template and returns a prompt.
52
-
53
- Parameters
54
- ----------
55
- text_content : Dict[str,str]
56
- the input text content to put in prompt template.
57
- all the keys must be included in the prompt template placeholder {{<placeholder name>}}.
58
-
59
- Returns : str
60
- a prompt.
61
- """
62
- pattern = re.compile(r'{{(.*?)}}')
63
- placeholders = pattern.findall(prompt_template)
64
- if len(placeholders) != len(text_content):
65
- raise ValueError(f"Expect text_content ({len(text_content)}) and prompt template placeholder ({len(placeholders)}) to have equal size.")
66
- if not all([k in placeholders for k, _ in text_content.items()]):
67
- raise ValueError(f"All keys in text_content ({text_content.keys()}) must match placeholders in prompt template ({placeholders}).")
68
-
69
- prompt = pattern.sub(lambda match: re.sub(r'\\', r'\\\\', text_content[match.group(1)]), prompt_template)
70
-
71
- return prompt
72
49
 
73
50
 
74
51
  def rewrite(self, draft:str) -> str:
@@ -80,8 +57,8 @@ class PromptEditor:
80
57
  with open(file_path, 'r') as f:
81
58
  rewrite_prompt_template = f.read()
82
59
 
83
- prompt = self._apply_prompt_template(text_content={"draft": draft, "prompt_guideline": self.prompt_guide},
84
- prompt_template=rewrite_prompt_template)
60
+ prompt = apply_prompt_template(prompt_template=rewrite_prompt_template,
61
+ text_content={"draft": draft, "prompt_guideline": self.prompt_guide})
85
62
  messages = [{"role": "system", "content": self.system_prompt},
86
63
  {"role": "user", "content": prompt}]
87
64
  res = self.inference_engine.chat(messages, verbose=True)
@@ -96,8 +73,8 @@ class PromptEditor:
96
73
  with open(file_path, 'r') as f:
97
74
  comment_prompt_template = f.read()
98
75
 
99
- prompt = self._apply_prompt_template(text_content={"draft": draft, "prompt_guideline": self.prompt_guide},
100
- prompt_template=comment_prompt_template)
76
+ prompt = apply_prompt_template(prompt_template=comment_prompt_template,
77
+ text_content={"draft": draft, "prompt_guideline": self.prompt_guide})
101
78
  messages = [{"role": "system", "content": self.system_prompt},
102
79
  {"role": "user", "content": prompt}]
103
80
  res = self.inference_engine.chat(messages, verbose=True)
@@ -254,8 +231,8 @@ class PromptEditor:
254
231
  with open(file_path, 'r') as f:
255
232
  chat_prompt_template = f.read()
256
233
 
257
- guideline = self._apply_prompt_template(text_content={"prompt_guideline": self.prompt_guide},
258
- prompt_template=chat_prompt_template)
234
+ guideline = apply_prompt_template(prompt_template=chat_prompt_template,
235
+ text_content={"prompt_guideline": self.prompt_guide})
259
236
 
260
237
  self.messages = [{"role": "system", "content": self.system_prompt + guideline}]
261
238
 
@@ -288,8 +265,8 @@ class PromptEditor:
288
265
  with open(file_path, 'r') as f:
289
266
  chat_prompt_template = f.read()
290
267
 
291
- guideline = self._apply_prompt_template(text_content={"prompt_guideline": self.prompt_guide},
292
- prompt_template=chat_prompt_template)
268
+ guideline = apply_prompt_template(prompt_template=chat_prompt_template,
269
+ text_content={"prompt_guideline": self.prompt_guide})
293
270
 
294
271
  messages = [{"role": "system", "content": self.system_prompt + guideline}] + messages
295
272
 
@@ -0,0 +1,95 @@
1
+ from typing import List, Dict, Union
2
+ import re
3
+ import json
4
+ import warnings
5
+ import json_repair
6
+
7
+ def _find_dict_strings(text: str) -> List[str]:
8
+ """
9
+ Extracts balanced JSON-like dictionaries from a string, even if nested.
10
+
11
+ Parameters:
12
+ -----------
13
+ text : str
14
+ the input text containing JSON-like structures.
15
+
16
+ Returns : List[str]
17
+ A list of valid JSON-like strings representing dictionaries.
18
+ """
19
+ open_brace = 0
20
+ start = -1
21
+ json_objects = []
22
+
23
+ for i, char in enumerate(text):
24
+ if char == '{':
25
+ if open_brace == 0:
26
+ # start of a new JSON object
27
+ start = i
28
+ open_brace += 1
29
+ elif char == '}':
30
+ open_brace -= 1
31
+ if open_brace == 0 and start != -1:
32
+ json_objects.append(text[start:i + 1])
33
+ start = -1
34
+
35
+ return json_objects
36
+
37
+
38
+ def extract_json(gen_text:str) -> List[Dict[str, str]]:
39
+ """
40
+ This function takes generated text and outputs a list of JSON information tuples
41
+ """
42
+ out = []
43
+ dict_str_list = _find_dict_strings(gen_text)
44
+ for dict_str in dict_str_list:
45
+ try:
46
+ dict_obj = json.loads(dict_str)
47
+ out.append(dict_obj)
48
+ except json.JSONDecodeError:
49
+ dict_obj = json_repair.repair_json(dict_str, skip_json_loads=True, return_objects=True)
50
+ if dict_obj:
51
+ warnings.warn(f'JSONDecodeError detected, fixed with repair_json:\n{dict_str}', RuntimeWarning)
52
+ out.append(dict_obj)
53
+ else:
54
+ warnings.warn(f'JSONDecodeError could not be fixed:\n{dict_str}', RuntimeWarning)
55
+ return out
56
+
57
+
58
+ def apply_prompt_template(prompt_template:str, text_content:Union[str, Dict[str,str]]) -> str:
59
+ """
60
+ This method applies text_content to prompt_template and returns a prompt.
61
+
62
+ Parameters:
63
+ ----------
64
+ prompt_template : str
65
+ the prompt template with placeholders {{<placeholder name>}}.
66
+ text_content : Union[str, Dict[str,str]]
67
+ the input text content to put in prompt template.
68
+ If str, the prompt template must have exactly 1 placeholder {{<placeholder name>}}, regardless of placeholder name.
69
+ If dict, all the keys must be included in the prompt template placeholder {{<placeholder name>}}. All values must be str.
70
+
71
+ Returns : str
72
+ a user prompt.
73
+ """
74
+ pattern = re.compile(r'{{(.*?)}}')
75
+ if isinstance(text_content, str):
76
+ matches = pattern.findall(prompt_template)
77
+ if len(matches) != 1:
78
+ raise ValueError("When text_content is str, the prompt template must has exactly 1 placeholder {{<placeholder name>}}.")
79
+ text = re.sub(r'\\', r'\\\\', text_content)
80
+ prompt = pattern.sub(text, prompt_template)
81
+
82
+ elif isinstance(text_content, dict):
83
+ # Check if all values are str
84
+ if not all([isinstance(v, str) for v in text_content.values()]):
85
+ raise ValueError("All values in text_content must be str.")
86
+ # Check if all keys are in the prompt template
87
+ placeholders = pattern.findall(prompt_template)
88
+ if len(placeholders) != len(text_content):
89
+ raise ValueError(f"Expect text_content ({len(text_content)}) and prompt template placeholder ({len(placeholders)}) to have equal size.")
90
+ if not all([k in placeholders for k, _ in text_content.items()]):
91
+ raise ValueError(f"All keys in text_content ({text_content.keys()}) must match placeholders in prompt template ({placeholders}).")
92
+
93
+ prompt = pattern.sub(lambda match: re.sub(r'\\', r'\\\\', text_content[match.group(1)]), prompt_template)
94
+
95
+ return prompt
File without changes
File without changes