llm-ie 1.2.2__tar.gz → 1.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {llm_ie-1.2.2 → llm_ie-1.2.4}/PKG-INFO +1 -1
  2. {llm_ie-1.2.2 → llm_ie-1.2.4}/pyproject.toml +1 -1
  3. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/__init__.py +5 -4
  4. llm_ie-1.2.4/src/llm_ie/asset/default_prompts/LLMUnitChunker_user_prompt.txt +129 -0
  5. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/chunkers.py +145 -6
  6. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/data_types.py +23 -37
  7. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/engines.py +621 -61
  8. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/extractors.py +341 -297
  9. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/prompt_editor.py +9 -32
  10. llm_ie-1.2.4/src/llm_ie/utils.py +95 -0
  11. {llm_ie-1.2.2 → llm_ie-1.2.4}/README.md +0 -0
  12. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/PromptEditor_prompts/chat.txt +0 -0
  13. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/PromptEditor_prompts/comment.txt +0 -0
  14. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/PromptEditor_prompts/rewrite.txt +0 -0
  15. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/PromptEditor_prompts/system.txt +0 -0
  16. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/BasicReviewFrameExtractor_addition_review_prompt.txt +0 -0
  17. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/BasicReviewFrameExtractor_revision_review_prompt.txt +0 -0
  18. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_addition_review_prompt.txt +0 -0
  19. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/ReviewFrameExtractor_revision_review_prompt.txt +0 -0
  20. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_addition_review_prompt.txt +0 -0
  21. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/default_prompts/SentenceReviewFrameExtractor_revision_review_prompt.txt +0 -0
  22. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/AttributeExtractor_prompt_guide.txt +0 -0
  23. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/BasicFrameExtractor_prompt_guide.txt +0 -0
  24. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/BasicReviewFrameExtractor_prompt_guide.txt +0 -0
  25. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/BinaryRelationExtractor_prompt_guide.txt +0 -0
  26. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/DirectFrameExtractor_prompt_guide.txt +0 -0
  27. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/MultiClassRelationExtractor_prompt_guide.txt +0 -0
  28. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/ReviewFrameExtractor_prompt_guide.txt +0 -0
  29. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt +0 -0
  30. {llm_ie-1.2.2 → llm_ie-1.2.4}/src/llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: llm-ie
3
- Version: 1.2.2
3
+ Version: 1.2.4
4
4
  Summary: A comprehensive toolkit that provides building blocks for LLM-based named entity recognition, attribute extraction, and relation extraction pipelines.
5
5
  License: MIT
6
6
  Author: Enshuo (David) Hsu
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "llm-ie"
3
- version = "1.2.2"
3
+ version = "1.2.4"
4
4
  description = "A comprehensive toolkit that provides building blocks for LLM-based named entity recognition, attribute extraction, and relation extraction pipelines."
5
5
  authors = ["Enshuo (David) Hsu"]
6
6
  license = "MIT"
@@ -1,11 +1,12 @@
1
1
  from .data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
2
- from .engines import BasicLLMConfig, ReasoningLLMConfig, Qwen3LLMConfig, OpenAIReasoningLLMConfig, LlamaCppInferenceEngine, OllamaInferenceEngine, HuggingFaceHubInferenceEngine, OpenAIInferenceEngine, AzureOpenAIInferenceEngine, LiteLLMInferenceEngine
2
+ from .engines import BasicLLMConfig, ReasoningLLMConfig, Qwen3LLMConfig, OpenAIReasoningLLMConfig
3
+ from .engines import LlamaCppInferenceEngine, OllamaInferenceEngine, HuggingFaceHubInferenceEngine, VLLMInferenceEngine, SGLangInferenceEngine, OpenRouterInferenceEngine, OpenAIInferenceEngine, AzureOpenAIInferenceEngine, LiteLLMInferenceEngine
3
4
  from .extractors import DirectFrameExtractor, ReviewFrameExtractor, BasicFrameExtractor, BasicReviewFrameExtractor, SentenceFrameExtractor, SentenceReviewFrameExtractor, AttributeExtractor, BinaryRelationExtractor, MultiClassRelationExtractor
4
- from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
5
+ from .chunkers import UnitChunker, WholeDocumentUnitChunker, SentenceUnitChunker, TextLineUnitChunker, SeparatorUnitChunker, LLMUnitChunker, ContextChunker, NoContextChunker, WholeDocumentContextChunker, SlideWindowContextChunker
5
6
  from .prompt_editor import PromptEditor
6
7
 
7
8
  __all__ = ["LLMInformationExtractionFrame", "LLMInformationExtractionDocument",
8
- "BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
9
+ "BasicLLMConfig", "ReasoningLLMConfig", "Qwen3LLMConfig", "OpenAIReasoningLLMConfig", "LlamaCppInferenceEngine", "OllamaInferenceEngine", "HuggingFaceHubInferenceEngine", "VLLMInferenceEngine", "SGLangInferenceEngine", "OpenRouterInferenceEngine", "OpenAIInferenceEngine", "AzureOpenAIInferenceEngine", "LiteLLMInferenceEngine",
9
10
  "DirectFrameExtractor", "ReviewFrameExtractor", "BasicFrameExtractor", "BasicReviewFrameExtractor", "SentenceFrameExtractor", "SentenceReviewFrameExtractor", "AttributeExtractor", "BinaryRelationExtractor", "MultiClassRelationExtractor",
10
- "UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
11
+ "UnitChunker", "WholeDocumentUnitChunker", "SentenceUnitChunker", "TextLineUnitChunker", "SeparatorUnitChunker", "LLMUnitChunker", "ContextChunker", "NoContextChunker", "WholeDocumentContextChunker", "SlideWindowContextChunker",
11
12
  "PromptEditor"]
@@ -0,0 +1,129 @@
1
+ ### Task description
2
+ You are a helpful assistant that breaks down a text document into semantic units or chunks. Each chunk should represent a coherent section of the text, such as a paragraph, subsection, or topic.
3
+
4
+ ### Schema definition
5
+ You will output a JSON array of objects. Each object should have the following fields:
6
+ - "title": Generate a brief title summarizing the content of the chunk.
7
+ - "anchor_text": the first line of text in the chunk used to locate it in the original document. Must be an exact match.
8
+ - if there is a title or heading for the chunk, use that as the anchor_text.
9
+ - if there is no title or heading, use the first sentence of the chunk as the anchor_text.
10
+
11
+ ```JSON
12
+ [
13
+ {
14
+ "title": "<your title here>",
15
+ "anchor_text": "<the anchor text of the chunk here>"
16
+ },
17
+ {
18
+ "title": "<your title here>",
19
+ "anchor_text": "<the anchor text of the chunk here>"
20
+ }
21
+ ]
22
+ ```
23
+
24
+ ### Examples
25
+
26
+ **Input**:
27
+ "# Clinical Note
28
+
29
+ **Patient Name**: Michael Green
30
+ **Medical Record Number**: 1122334455
31
+ **Date of Visit**: January 5, 2025
32
+ **Provider**: Dr. Emily Carter, MD
33
+
34
+ ## Reason for Visit
35
+ Follow-up for poorly controlled type 2 diabetes and complaints of occasional dizziness and blurred vision.
36
+
37
+ ## Summary of Visit
38
+ Michael Green, a 62-year-old male with a known history of type 2 diabetes, hypertension, and obesity, presents for follow-up regarding his glycemic control. Despite recent adjustments to his treatment plan, his glucose readings have remained elevated, averaging 180-220 mg/dL. He reports occasional episodes of dizziness and blurred vision, particularly in the morning before meals. He denies chest pain, palpitations, or recent falls. He reports compliance with his medication regimen but admits to difficulty following a consistent low-carbohydrate diet.
39
+
40
+ Michael has been using a glucose meter to monitor his blood sugar levels and logs them daily. His last hemoglobin A1c, performed three months ago, was 9.2%. He reports no recent hospitalizations, infections, or significant stressors.
41
+
42
+ ## Notable History
43
+ - **Chronic Conditions**:
44
+ - Type 2 diabetes mellitus, diagnosed 10 years ago.
45
+ - Hypertension, well-controlled on medication.
46
+ - Hyperlipidemia, on statin therapy.
47
+ - **Past Surgical History**:
48
+ - Knee arthroscopy for a meniscal tear, age 50.
49
+ - **Family History**:
50
+ - Mother: Deceased at 75, complications from diabetes.
51
+ - Father: Deceased at 70, myocardial infarction."
52
+
53
+ **Output**:
54
+ ```JSON
55
+ [
56
+ {
57
+ "title": "Patient Information",
58
+ "anchor_text": "# Clinical Note"
59
+ },
60
+ {
61
+ "title": "Reason for Visit",
62
+ "anchor_text": "## Reason for Visit"
63
+ },
64
+ {
65
+ "title": "Summary of Visit",
66
+ "anchor_text": "## Summary of Visit"
67
+ },
68
+ {
69
+ "title": "Notable History",
70
+ "anchor_text": "## Notable History"
71
+ }
72
+ ]
73
+ ```
74
+
75
+ **Input**:
76
+ "In the [**Hospital1 18**] ED, 35.3 102 133/58 100%AC 500x20, 5, 1.0 with an
77
+ ABG 7.16/66/162. He had a CTH which was unremarkable. He then
78
+ had a CTA chest, afterwhich he went into PEA arrest.
79
+ Rescucitation last approximately 10-15 minutes with multiple
80
+ rounds of epi and bicarb, with ROSC. He was then admitted to the
81
+ MICU for further management.
82
+ .
83
+ Currently, the patient is intubated, sedated, and parlyzed.
84
+
85
+ Past Medical History:
86
+ Asthma
87
+ Dilated cardiomyopathy
88
+ Multiple admissions for dyspnea this winter ([**1-26**]).
89
+ Anxiety/depression
90
+ CKD
91
+ HLD
92
+ Obesity
93
+ HTN
94
+
95
+ Social History:
96
+ Unknown
97
+
98
+ Family History:
99
+ Unknown"
100
+
101
+ **Output**:
102
+ ```JSON
103
+ [
104
+ {
105
+ "title": "Patient Presentation and Initial Management",
106
+ "anchor_text": "In the [**Hospital1 18**] ED, 35.3 102 133/58 100%AC 500x20, 5, 1.0 with an"
107
+ },
108
+ {
109
+ "title": "Current Status of the Patient",
110
+ "anchor_text": "Currently, the patient is intubated, sedated, and parlyzed."
111
+ },
112
+ {
113
+ "title": "Past Medical History",
114
+ "anchor_text": "Past Medical History:"
115
+ },
116
+ {
117
+ "title": "Social History",
118
+ "anchor_text": "Social History:"
119
+ },
120
+ {
121
+ "title": "Family History",
122
+ "anchor_text": "Family History:"
123
+ }
124
+ ]
125
+ ```
126
+
127
+ ### Document text
128
+
129
+ "{{document_text}}"
@@ -1,6 +1,11 @@
1
1
  import abc
2
- from typing import Set, List, Dict, Tuple, Union, Callable
2
+ from typing import List
3
+ import asyncio
4
+ import uuid
5
+ import importlib.resources
6
+ from llm_ie.utils import extract_json, apply_prompt_template
3
7
  from llm_ie.data_types import FrameExtractionUnit
8
+ from llm_ie.engines import InferenceEngine
4
9
 
5
10
 
6
11
  class UnitChunker(abc.ABC):
@@ -11,7 +16,8 @@ class UnitChunker(abc.ABC):
11
16
  """
12
17
  pass
13
18
 
14
- def chunk(self, text:str) -> List[FrameExtractionUnit]:
19
+ @abc.abstractmethod
20
+ def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
15
21
  """
16
22
  Parameters:
17
23
  ----------
@@ -20,6 +26,12 @@ class UnitChunker(abc.ABC):
20
26
  """
21
27
  return NotImplemented
22
28
 
29
+ async def chunk_async(self, text:str, doc_id:str=None, executor=None) -> List[FrameExtractionUnit]:
30
+ """
31
+ asynchronous version of chunk method.
32
+ """
33
+ loop = asyncio.get_running_loop()
34
+ return await loop.run_in_executor(executor, self.chunk, text, doc_id)
23
35
 
24
36
  class WholeDocumentUnitChunker(UnitChunker):
25
37
  def __init__(self):
@@ -28,7 +40,7 @@ class WholeDocumentUnitChunker(UnitChunker):
28
40
  """
29
41
  super().__init__()
30
42
 
31
- def chunk(self, text:str) -> List[FrameExtractionUnit]:
43
+ def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
32
44
  """
33
45
  Parameters:
34
46
  ----------
@@ -36,6 +48,7 @@ class WholeDocumentUnitChunker(UnitChunker):
36
48
  The document text.
37
49
  """
38
50
  return [FrameExtractionUnit(
51
+ doc_id=doc_id if doc_id is not None else str(uuid.uuid4()),
39
52
  start=0,
40
53
  end=len(text),
41
54
  text=text
@@ -57,19 +70,21 @@ class SeparatorUnitChunker(UnitChunker):
57
70
 
58
71
  self.sep = sep
59
72
 
60
- def chunk(self, text:str) -> List[FrameExtractionUnit]:
73
+ def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
61
74
  """
62
75
  Parameters:
63
76
  ----------
64
77
  text : str
65
78
  The document text.
66
79
  """
80
+ doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
67
81
  paragraphs = text.split(self.sep)
68
82
  paragraph_units = []
69
83
  start = 0
70
84
  for paragraph in paragraphs:
71
85
  end = start + len(paragraph)
72
86
  paragraph_units.append(FrameExtractionUnit(
87
+ doc_id=doc_id,
73
88
  start=start,
74
89
  end=end,
75
90
  text=paragraph
@@ -77,6 +92,7 @@ class SeparatorUnitChunker(UnitChunker):
77
92
  start = end + len(self.sep)
78
93
  return paragraph_units
79
94
 
95
+
80
96
  class SentenceUnitChunker(UnitChunker):
81
97
  from nltk.tokenize.punkt import PunktSentenceTokenizer
82
98
  def __init__(self):
@@ -85,16 +101,18 @@ class SentenceUnitChunker(UnitChunker):
85
101
  """
86
102
  super().__init__()
87
103
 
88
- def chunk(self, text:str) -> List[FrameExtractionUnit]:
104
+ def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
89
105
  """
90
106
  Parameters:
91
107
  ----------
92
108
  text : str
93
109
  The document text.
94
110
  """
111
+ doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
95
112
  sentences = []
96
113
  for start, end in self.PunktSentenceTokenizer().span_tokenize(text):
97
114
  sentences.append(FrameExtractionUnit(
115
+ doc_id=doc_id,
98
116
  start=start,
99
117
  end=end,
100
118
  text=text[start:end]
@@ -109,19 +127,21 @@ class TextLineUnitChunker(UnitChunker):
109
127
  """
110
128
  super().__init__()
111
129
 
112
- def chunk(self, text:str) -> List[FrameExtractionUnit]:
130
+ def chunk(self, text:str, doc_id:str=None) -> List[FrameExtractionUnit]:
113
131
  """
114
132
  Parameters:
115
133
  ----------
116
134
  text : str
117
135
  The document text.
118
136
  """
137
+ doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
119
138
  lines = text.split('\n')
120
139
  line_units = []
121
140
  start = 0
122
141
  for line in lines:
123
142
  end = start + len(line)
124
143
  line_units.append(FrameExtractionUnit(
144
+ doc_id=doc_id,
125
145
  start=start,
126
146
  end=end,
127
147
  text=line
@@ -129,6 +149,100 @@ class TextLineUnitChunker(UnitChunker):
129
149
  start = end + 1
130
150
  return line_units
131
151
 
152
+ class LLMUnitChunker(UnitChunker):
153
+ def __init__(self, inference_engine:InferenceEngine, prompt_template:str=None, system_prompt:str=None):
154
+ """
155
+ This class prompt an LLM for document segmentation (e.g., sections, paragraphs).
156
+
157
+ Parameters:
158
+ ----------
159
+ inference_engine : InferenceEngine
160
+ the LLM inferencing engine object.
161
+ prompt_template : str
162
+ the prompt template that defines how to chunk the document. Must define a JSON schema with
163
+ ```json
164
+ [
165
+ {
166
+ "title": "<your title here>",
167
+ "anchor_text": "<the anchor text of the chunk here>"
168
+ },
169
+ {
170
+ "title": "<your title here>",
171
+ "anchor_text": "<the anchor text of the chunk here>"
172
+ }
173
+ ]
174
+ ```
175
+ system_prompt : str, optional
176
+ The system prompt.
177
+ """
178
+ self.inference_engine = inference_engine
179
+
180
+ if prompt_template is None:
181
+ file_path = importlib.resources.files('llm_ie.asset.default_prompts').joinpath("LLMUnitChunker_user_prompt.txt")
182
+ with open(file_path, 'r', encoding="utf-8") as f:
183
+ self.prompt_template = f.read()
184
+ else:
185
+ self.prompt_template = prompt_template
186
+
187
+ self.system_prompt = system_prompt
188
+
189
+ def chunk(self, text, doc_id=None) -> List[FrameExtractionUnit]:
190
+ """
191
+ Parameters:
192
+ -----------
193
+ text : str
194
+ the document text.
195
+ doc_id : str, optional
196
+ the document id.
197
+ """
198
+ doc_id = doc_id if doc_id is not None else str(uuid.uuid4())
199
+ user_prompt = apply_prompt_template(prompt_template=self.prompt_template, text_content=text)
200
+ messages = []
201
+ if self.system_prompt is not None:
202
+ messages.append({'role': 'system', 'content': self.system_prompt})
203
+ messages.append({'role': 'user', 'content': user_prompt})
204
+
205
+ gen_text = self.inference_engine.chat(messages=messages)
206
+
207
+ header_list = extract_json(gen_text=gen_text["response"])
208
+ units = []
209
+ start = 0
210
+ prev_end = 0
211
+ for header in header_list:
212
+ if "anchor_text" not in header:
213
+ Warning.warn(f"Missing anchor_text in header: {header}. Skipping this header.")
214
+ continue
215
+ if not isinstance(header["anchor_text"], str):
216
+ Warning.warn(f"Invalid anchor_text: {header['anchor_text']}. Skipping this header.")
217
+ continue
218
+
219
+ start = prev_end
220
+ # find the first instandce of the leading sentence in the rest of the text
221
+ end = text.find(header["anchor_text"], start)
222
+ # if not found, skip this header
223
+ if end == -1:
224
+ continue
225
+ # if start == end (empty text), skip this header
226
+ if start == end:
227
+ continue
228
+ # create a frame extraction unit
229
+ units.append(FrameExtractionUnit(
230
+ doc_id=doc_id,
231
+ start=start,
232
+ end=end,
233
+ text=text[start:end]
234
+ ))
235
+ prev_end = end
236
+ # add the last section
237
+ if prev_end < len(text):
238
+ units.append(FrameExtractionUnit(
239
+ doc_id=doc_id,
240
+ start=prev_end,
241
+ end=len(text),
242
+ text=text[prev_end:]
243
+ ))
244
+ return units
245
+
132
246
 
133
247
  class ContextChunker(abc.ABC):
134
248
  def __init__(self):
@@ -138,6 +252,24 @@ class ContextChunker(abc.ABC):
138
252
  """
139
253
  pass
140
254
 
255
+ @abc.abstractmethod
256
+ def fit(self, text:str, units:List[FrameExtractionUnit]):
257
+ """
258
+ Parameters:
259
+ ----------
260
+ text : str
261
+ The document text.
262
+ """
263
+ pass
264
+
265
+ async def fit_async(self, text:str, units:List[FrameExtractionUnit], executor=None):
266
+ """
267
+ asynchronous version of fit method.
268
+ """
269
+ loop = asyncio.get_running_loop()
270
+ return await loop.run_in_executor(executor, self.fit, text, units)
271
+
272
+ @abc.abstractmethod
141
273
  def chunk(self, unit:FrameExtractionUnit) -> str:
142
274
  """
143
275
  Parameters:
@@ -150,6 +282,13 @@ class ContextChunker(abc.ABC):
150
282
  """
151
283
  return NotImplemented
152
284
 
285
+ async def chunk_async(self, unit:FrameExtractionUnit, executor=None) -> str:
286
+ """
287
+ asynchronous version of chunk method.
288
+ """
289
+ loop = asyncio.get_running_loop()
290
+ return await loop.run_in_executor(executor, self.chunk, unit)
291
+
153
292
 
154
293
  class NoContextChunker(ContextChunker):
155
294
  def __init__(self):
@@ -7,13 +7,15 @@ import json
7
7
 
8
8
  @dataclass
9
9
  class FrameExtractionUnit:
10
- def __init__(self, start:int, end:int, text:str):
10
+ def __init__(self, doc_id:str, start:int, end:int, text:str):
11
11
  """
12
12
  This class holds the unit text for frame extraction, for example, a sentence.
13
13
  FrameExtractor prompts it one at a time to extract frames.
14
14
 
15
15
  Parameters
16
16
  ----------
17
+ doc_id : str, Optional
18
+ document ID.
17
19
  start : int
18
20
  start character position of the unit text, relative to the whole document
19
21
  end : int
@@ -21,9 +23,28 @@ class FrameExtractionUnit:
21
23
  text : str
22
24
  the unit text. Should be the exact string by [start:end]
23
25
  """
26
+ self.doc_id = doc_id
24
27
  self.start = start
25
28
  self.end = end
26
29
  self.text = text
30
+ # status: "pending", "success", "fail"
31
+ self.status = "pending"
32
+ # generated text by LLM
33
+ self.gen_text = None
34
+
35
+ def get_status(self) -> str:
36
+ return self.status
37
+
38
+ def set_status(self, status:str):
39
+ if status not in {"pending", "success", "fail"}:
40
+ raise ValueError('status must be one of {"pending", "success", "fail"}.')
41
+ self.status = status
42
+
43
+ def get_generated_text(self) -> str:
44
+ return self.gen_text
45
+
46
+ def set_generated_text(self, gen_text:str):
47
+ self.gen_text = gen_text
27
48
 
28
49
  def __eq__(self, other):
29
50
  if not isinstance(other, FrameExtractionUnit):
@@ -39,43 +60,8 @@ class FrameExtractionUnit:
39
60
  return self.start < other.start
40
61
 
41
62
  def __repr__(self):
42
- return f"FrameExtractionUnit(start={self.start}, end={self.end}, text='{self.text[:100]}...')"
43
-
44
-
45
- @dataclass
46
- class FrameExtractionUnitResult:
47
- def __init__(self, start:int, end:int, text:str, gen_text:str):
48
- """
49
- This class holds the unit text for frame extraction, for example, a sentence.
50
- FrameExtractor prompt it one at a time to extract frames.
51
-
52
- Parameters
53
- ----------
54
- start : int
55
- start character position of the unit text, relative to the whole document
56
- end : int
57
- end character position of the unit text, relative to the whole document
58
- text : str
59
- the unit text. Should be the exact string by [start:end]
60
- gen_text : str
61
- the generated text by LLM (ideally) following '[{"entity_text": "xxx", "attr": {"key": "value"}}]' format. Does not contain spans (start/end).
62
- """
63
- self.start = start
64
- self.end = end
65
- self.text = text
66
- self.gen_text = gen_text
67
-
68
- def __eq__(self, other):
69
- if not isinstance(other, FrameExtractionUnit):
70
- return NotImplemented
71
- return (self.start == other.start and self.end == other.end and self.text == other.text and self.gen_text == other.gen_text)
63
+ return f"FrameExtractionUnit(doc_id={self.doc_id}, start={self.start}, end={self.end}, status={self.status}, text='{self.text[:100]}...')"
72
64
 
73
- def __hash__(self):
74
- return hash((self.start, self.end, self.text, self.gen_text))
75
-
76
- def __repr__(self):
77
- return f"FrameExtractionUnitResult(start={self.start}, end={self.end}, text='{self.text[:100]}...', gen_text='{self.gen_text[:100]}...')"
78
-
79
65
 
80
66
  @dataclass
81
67
  class LLMInformationExtractionFrame: