llm-ie 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/extractors.py
CHANGED
@@ -5,10 +5,11 @@ import inspect
 import importlib.resources
 import warnings
 import itertools
-from typing import List, Dict, Tuple, Union, Callable
+from typing import Set, List, Dict, Tuple, Union, Callable
 from llm_ie.data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
 from llm_ie.engines import InferenceEngine
 from colorama import Fore, Style
+from nltk.tokenize import RegexpTokenizer


 class Extractor:
@@ -37,7 +38,7 @@ class Extractor:
         This method returns the pre-defined prompt guideline for the extractor from the package asset.
         """
         file_path = importlib.resources.files('llm_ie.asset.prompt_guide').joinpath(f"{cls.__name__}_prompt_guide.txt")
-        with open(file_path, 'r') as f:
+        with open(file_path, 'r', encoding="utf-8") as f:
             return f.read()


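The only functional change in this hunk (and in the two review-prompt hunks further down) is the explicit encoding="utf-8". A small illustration of why that matters; the demo file name is made up and not part of the package:

# Without an explicit encoding, open() falls back to the platform's locale
# encoding (e.g. cp1252 on Windows), which can garble or reject non-ASCII
# characters in the bundled prompt files. Pinning utf-8 makes reads behave
# the same on every platform.
from pathlib import Path

demo = Path("prompt_guide_demo.txt")
demo.write_text("naïve café guideline", encoding="utf-8")
print(open(demo, "r", encoding="utf-8").read())
demo.unlink()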
@@ -139,9 +140,71 @@ class FrameExtractor(Extractor):
                          prompt_template=prompt_template,
                          system_prompt=system_prompt,
                          **kwrs)
-
+        self.tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')
+
+
+    def _jaccard_score(self, s1:Set[str], s2:Set[str]) -> float:
+        """
+        This method calculates the Jaccard score between two sets of word tokens.
+        """
+        return len(s1.intersection(s2)) / len(s1.union(s2))
+
+
+    def _get_word_tokens(self, text) -> Tuple[List[str], List[Tuple[int]]]:
+        """
+        This method tokenizes the input text into a list of word tokens and their spans.
+        """
+        tokens = []
+        spans = []
+        for span in self.tokenizer.span_tokenize(text):
+            spans.append(span)
+            start, end = span
+            tokens.append(text[start:end])
+        return tokens, spans
+
+
+    def _get_closest_substring(self, text:str, pattern:str, buffer_size:float=0.2) -> Tuple[Tuple[int, int], float]:
+        """
+        This method finds the closest (highest Jaccard score) substring in text that matches the pattern.
+        the substring must start with the same word token as the pattern. This is due to the observation that
+        LLM often generate the first few words consistently.
+
+        Parameters
+        ----------
+        text : str
+            the input text.
+        pattern : str
+            the pattern to match.
+        buffer_size : float, Optional
+            the buffer size for the matching window. Default is 20% of pattern length.
+
+        Returns : Tuple[Tuple[int, int], float]
+            a tuple of 2-tuple span and Jaccard score.
+        """
+        text_tokens, text_spans = self._get_word_tokens(text)
+        pattern_tokens, _ = self._get_word_tokens(pattern)
+        pattern_tokens_set = set(pattern_tokens)
+        window_size = len(pattern_tokens)
+        window_size_min = int(window_size * (1 - buffer_size))
+        window_size_max = int(window_size * (1 + buffer_size))
+        closest_substring_span = None
+        best_score = 0
+
+        for i in range(len(text_tokens) - window_size_max):
+            for w in range(window_size_min, window_size_max):
+                sub_str_tokens = text_tokens[i:i + w]
+                if sub_str_tokens[0] == pattern_tokens[0]:
+                    score = self._jaccard_score(set(sub_str_tokens), pattern_tokens_set)
+                    if score > best_score:
+                        best_score = score
+                        sub_string_word_spans = text_spans[i:i + w]
+                        closest_substring_span = (sub_string_word_spans[0][0], sub_string_word_spans[-1][-1])

-
+        return closest_substring_span, best_score
+
+
+    def _find_entity_spans(self, text: str, entities: List[str], case_sensitive:bool=False,
+                           fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8) -> List[Tuple[int]]:
         """
         This function inputs a text and a list of entity text,
         outputs a list of spans (2-tuple) for each entity.
@@ -151,19 +214,46 @@ class FrameExtractor(Extractor):
         ----------
         text : str
             text that contains entities
+        entities : List[str]
+            a list of entity text to find in the text
+        case_sensitive : bool, Optional
+            if True, entity text matching will be case-sensitive.
+        fuzzy_match : bool, Optional
+            if True, fuzzy matching will be applied to find entity text.
+        fuzzy_buffer_size : float, Optional
+            the buffer size for fuzzy matching. Default is 20% of entity text length.
+        fuzzy_score_cutoff : float, Optional
+            the Jaccard score cutoff for fuzzy matching.
+            Matched entity text must have a score higher than this value or a None will be returned.
         """
+        # Handle case sensitivity
+        if not case_sensitive:
+            text = text.lower()
+
+        # Match entities
         entity_spans = []
-        for entity in entities:
-            if case_sensitive:
-
-
-
-
+        for entity in entities:
+            if not case_sensitive:
+                entity = entity.lower()
+
+            # Exact match
+            match = re.search(re.escape(entity), text)
             if match:
                 start, end = match.span()
                 entity_spans.append((start, end))
                 # Replace the found entity with spaces to avoid finding the same instance again
                 text = text[:start] + ' ' * (end - start) + text[end:]
+            # Fuzzy match
+            elif fuzzy_match:
+                closest_substring_span, best_score = self._get_closest_substring(text, entity, buffer_size=fuzzy_buffer_size)
+                if best_score >= fuzzy_score_cutoff and closest_substring_span:
+                    entity_spans.append(closest_substring_span)
+                    # Replace the found entity with spaces to avoid finding the same instance again
+                    text = text[:closest_substring_span[0]] + ' ' * (closest_substring_span[1] - closest_substring_span[0]) + text[closest_substring_span[1]:]
+                else:
+                    entity_spans.append(None)
+
+            # No match
             else:
                 entity_spans.append(None)

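The two hunks above add the fuzzy fallback used when the LLM paraphrases an entity instead of quoting it verbatim: tokenize the text and the entity, slide windows of roughly the entity's token length over the text, and keep the window with the highest Jaccard score, anchored on an exact first-token match. Below is a minimal standalone sketch of the same search, not the package's code path: re.finditer stands in for nltk's RegexpTokenizer (same token pattern), and the clinical sentence is made up.

import re
from typing import List, Tuple

TOKEN_RE = re.compile(r'\w+|[^\w\s]')

def word_tokens(text: str) -> Tuple[List[str], List[Tuple[int, int]]]:
    # Word/punctuation tokens plus their (start, end) character spans.
    spans = [m.span() for m in TOKEN_RE.finditer(text)]
    return [text[s:e] for s, e in spans], spans

def jaccard(s1: set, s2: set) -> float:
    return len(s1 & s2) / len(s1 | s2)

def closest_substring(text: str, pattern: str, buffer_size: float = 0.2):
    # Slide windows of roughly the pattern's token length over the text and keep
    # the highest-scoring one; the first token must match exactly, mirroring the
    # observation that LLMs usually reproduce the first few words faithfully.
    text_toks, text_spans = word_tokens(text)
    pat_toks, _ = word_tokens(pattern)
    pat_set = set(pat_toks)
    w_min = int(len(pat_toks) * (1 - buffer_size))
    w_max = int(len(pat_toks) * (1 + buffer_size))
    best_span, best_score = None, 0.0
    for i in range(len(text_toks) - w_max):
        for w in range(w_min, w_max):
            window = text_toks[i:i + w]
            if window and window[0] == pat_toks[0]:
                score = jaccard(set(window), pat_set)
                if score > best_score:
                    best_score = score
                    best_span = (text_spans[i][0], text_spans[i + w - 1][1])
    return best_span, best_score

text = "The patient reports shortness of breath and mild chest pain since yesterday."
llm_output = "shortness of breath and chest pain"   # the LLM dropped the word "mild"
span, score = closest_substring(text.lower(), llm_output.lower())
print(span, round(score, 3), repr(text[span[0]:span[1]]))

This toy match scores roughly 0.71, so with the default fuzzy_score_cutoff of 0.8 it would be discarded; the cutoff is the knob that trades recall for precision.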
@@ -276,7 +366,9 @@ class BasicFrameExtractor(FrameExtractor):


     def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=2048,
-                        temperature:float=0.0,
+                        temperature:float=0.0, document_key:str=None, stream:bool=False,
+                        case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2,
+                        fuzzy_score_cutoff:float=0.8, **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs a text and outputs a list of LLMInformationExtractionFrame
         It use the extract() method and post-process outputs into frames.
@@ -293,18 +385,30 @@ class BasicFrameExtractor(FrameExtractor):
             the max number of new tokens LLM should generate.
         temperature : float, Optional
             the temperature for token sampling.
-        case_sensitive : bool, Optional
-            if True, entity text matching will be case-sensitive.
         document_key : str, Optional
             specify the key in text_content where document text is.
             If text_content is str, this parameter will be ignored.
+        stream : bool, Optional
+            if True, LLM generated text will be printed in terminal in real-time.
+        case_sensitive : bool, Optional
+            if True, entity text matching will be case-sensitive.
+        fuzzy_match : bool, Optional
+            if True, fuzzy matching will be applied to find entity text.
+        fuzzy_buffer_size : float, Optional
+            the buffer size for fuzzy matching. Default is 20% of entity text length.
+        fuzzy_score_cutoff : float, Optional
+            the Jaccard score cutoff for fuzzy matching.
+            Matched entity text must have a score higher than this value or a None will be returned.

         Return : str
             a list of frames.
         """
         frame_list = []
         gen_text = self.extract(text_content=text_content,
-                                max_new_tokens=max_new_tokens,
+                                max_new_tokens=max_new_tokens,
+                                temperature=temperature,
+                                stream=stream,
+                                **kwrs)

         entity_json = []
         for entity in self._extract_json(gen_text=gen_text):
@@ -320,7 +424,10 @@ class BasicFrameExtractor(FrameExtractor):

         spans = self._find_entity_spans(text=text,
                                         entities=[e[entity_key] for e in entity_json],
-                                        case_sensitive=case_sensitive
+                                        case_sensitive=case_sensitive,
+                                        fuzzy_match=fuzzy_match,
+                                        fuzzy_buffer_size=fuzzy_buffer_size,
+                                        fuzzy_score_cutoff=fuzzy_score_cutoff)

         for i, (ent, span) in enumerate(zip(entity_json, spans)):
             if span is not None:
@@ -328,7 +435,7 @@ class BasicFrameExtractor(FrameExtractor):
                 frame = LLMInformationExtractionFrame(frame_id=f"{i}",
                                                       start=start,
                                                       end=end,
-                                                      entity_text=
+                                                      entity_text=text[start:end],
                                                       attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
                 frame_list.append(frame)
         return frame_list
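Taken together, the BasicFrameExtractor hunks above thread the new stream and fuzzy-matching options from extract_frames() down into _find_entity_spans(). A hedged usage sketch follows; only the extract_frames() parameters are confirmed by this diff, while the engine class name, its constructor arguments, and the prompt template are assumptions for illustration.

from llm_ie.engines import OllamaInferenceEngine    # assumed engine class; check llm_ie.engines for the names in your install
from llm_ie.extractors import BasicFrameExtractor

# Toy prompt template and input; real templates come from the prompt guides shipped with the package.
prompt_template = 'Extract symptoms as a JSON list of {"entity_text": ...} objects.\n\n{{input}}'
engine = OllamaInferenceEngine(model_name="llama3.1")  # assumed constructor signature

extractor = BasicFrameExtractor(engine, prompt_template=prompt_template)
frames = extractor.extract_frames(
    text_content="The patient reports shortness of breath and mild chest pain.",
    entity_key="entity_text",     # key in the LLM's JSON output holding the entity string
    stream=True,                  # print generation in real time (new parameter)
    case_sensitive=False,
    fuzzy_match=True,             # fall back to Jaccard matching when exact search fails
    fuzzy_buffer_size=0.2,
    fuzzy_score_cutoff=0.8,
)
for frame in frames:
    print(frame.entity_text, (frame.start, frame.end), frame.attr)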
@@ -370,7 +477,7 @@ class ReviewFrameExtractor(BasicFrameExtractor):
         else:
             file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
                 joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
-            with open(file_path, 'r') as f:
+            with open(file_path, 'r', encoding="utf-8") as f:
                 self.review_prompt = f.read()

             warnings.warn(f'Custom review prompt not provided. The default review prompt is used:\n"{self.review_prompt}"', UserWarning)
@@ -559,8 +666,9 @@ class SentenceFrameExtractor(FrameExtractor):


     def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=512,
-                        document_key:str=None, multi_turn:bool=False, temperature:float=0.0,
-
+                        document_key:str=None, multi_turn:bool=False, temperature:float=0.0, stream:bool=False,
+                        case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
+                        **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs a text and outputs a list of LLMInformationExtractionFrame
         It use the extract() method and post-process outputs into frames.
@@ -586,10 +694,17 @@ class SentenceFrameExtractor(FrameExtractor):
             can better utilize the KV caching.
         temperature : float, Optional
             the temperature for token sampling.
-        case_sensitive : bool, Optional
-            if True, entity text matching will be case-sensitive.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        case_sensitive : bool, Optional
+            if True, entity text matching will be case-sensitive.
+        fuzzy_match : bool, Optional
+            if True, fuzzy matching will be applied to find entity text.
+        fuzzy_buffer_size : float, Optional
+            the buffer size for fuzzy matching. Default is 20% of entity text length.
+        fuzzy_score_cutoff : float, Optional
+            the Jaccard score cutoff for fuzzy matching.
+            Matched entity text must have a score higher than this value or a None will be returned.

         Return : str
             a list of frames.
@@ -611,16 +726,21 @@ class SentenceFrameExtractor(FrameExtractor):
                 warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{entity_key}"). This frame will be dropped.', RuntimeWarning)

             spans = self._find_entity_spans(text=sent['sentence_text'],
-                                            entities=[e[entity_key] for e in entity_json],
+                                            entities=[e[entity_key] for e in entity_json],
+                                            case_sensitive=case_sensitive,
+                                            fuzzy_match=fuzzy_match,
+                                            fuzzy_buffer_size=fuzzy_buffer_size,
+                                            fuzzy_score_cutoff=fuzzy_score_cutoff)
             for ent, span in zip(entity_json, spans):
                 if span is not None:
                     start, end = span
+                    entity_text = sent['sentence_text'][start:end]
                     start += sent['sentence_start']
                     end += sent['sentence_start']
                     frame = LLMInformationExtractionFrame(frame_id=f"{len(frame_list)}",
                                                           start=start,
                                                           end=end,
-                                                          entity_text=
+                                                          entity_text=entity_text,
                                                           attr={k: v for k, v in ent.items() if k != entity_key and v != ""})
                     frame_list.append(frame)
         return frame_list
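One detail worth noting in the SentenceFrameExtractor hunk above: entity_text is sliced from the sentence before start and end are shifted by sentence_start, because the spans returned by _find_entity_spans() are sentence-relative. A toy check of that ordering (values made up):

# Sentence-relative span -> document-level span, slicing the text first.
doc = "First sentence. BP was 140/90 today."
sent = {"sentence_text": "BP was 140/90 today.", "sentence_start": 16}

start, end = 7, 13                                  # span of "140/90" within the sentence
entity_text = sent["sentence_text"][start:end]      # slice before shifting the offsets
start += sent["sentence_start"]
end += sent["sentence_start"]

assert entity_text == "140/90"
assert doc[start:end] == entity_text                # shifted span points at the same text in the document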
@@ -663,7 +783,7 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
         else:
             file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
                 joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
-            with open(file_path, 'r') as f:
+            with open(file_path, 'r', encoding="utf-8") as f:
                 self.review_prompt = f.read()

             warnings.warn(f'Custom review prompt not provided. The default review prompt is used:\n"{self.review_prompt}"', UserWarning)
llm_ie/prompt_editor.py
CHANGED
@@ -5,8 +5,6 @@ from llm_ie.engines import InferenceEngine
 from llm_ie.extractors import FrameExtractor
 import re
 from colorama import Fore, Style
-import ipywidgets as widgets
-from IPython.display import display, HTML


 class PromptEditor:
@@ -121,6 +119,16 @@ class PromptEditor:
         """
         This method runs an interactive chat session in Jupyter/IPython using ipywidgets to help users write prompt templates.
         """
+        # Check if ipywidgets is installed
+        if importlib.util.find_spec("ipywidgets") is None:
+            raise ImportError("ipywidgets not found. Please install ipywidgets (```pip install ipywidgets```).")
+        import ipywidgets as widgets
+
+        # Check if IPython is installed
+        if importlib.util.find_spec("IPython") is None:
+            raise ImportError("IPython not found. Please install IPython (```pip install ipython```).")
+        from IPython.display import display, HTML
+
         # Load the chat prompt template from the resources
         file_path = importlib.resources.files('llm_ie.asset.PromptEditor_prompts').joinpath('chat.txt')
         with open(file_path, 'r') as f:
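The prompt_editor.py change above drops the module-level ipywidgets and IPython imports and re-imports them lazily inside the chat method, guarded by importlib.util.find_spec, so the package stays importable without the notebook extras. A minimal standalone sketch of that pattern; the function name and widget below are illustrative, not from the package:

import importlib.util

def chat_in_notebook():
    # Probe for the optional dependencies first, then import them only when the
    # notebook-facing feature is actually used.
    if importlib.util.find_spec("ipywidgets") is None:
        raise ImportError("ipywidgets not found. Install it with `pip install ipywidgets`.")
    import ipywidgets as widgets

    if importlib.util.find_spec("IPython") is None:
        raise ImportError("IPython not found. Install it with `pip install ipython`.")
    from IPython.display import display, HTML

    display(HTML("<b>Prompt editor ready.</b>"))
    return widgets.Textarea(description="Prompt:")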
llm_ie-0.3.1.dist-info/RECORD → llm_ie-0.3.3.dist-info/RECORD
RENAMED
@@ -16,8 +16,8 @@ llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt,sha256=oKH_QeD
 llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt,sha256=oKH_QeDgpw771ZdHk3L7DYz2Jvfm7OolUoTiJyMJI30,9541
 llm_ie/data_types.py,sha256=hPz3WOeAzfn2QKmb0CxHmRdQWZQ4G9zq8U-RJBVFdYk,14329
 llm_ie/engines.py,sha256=PTYs7s_iCPmI-yFUCVCPY_cMGS77ma2VGoz4rdNkODI,9308
-llm_ie/extractors.py,sha256=
-llm_ie/prompt_editor.py,sha256=
-llm_ie-0.3.
-llm_ie-0.3.
-llm_ie-0.3.
+llm_ie/extractors.py,sha256=yBdIcevjMfwto85Jb0KkRMN-AjIMk92fD5yWB3Qm8MY,64408
+llm_ie/prompt_editor.py,sha256=Xc5ZHsEnM8-YYITokIsM6BVsf2Ec_8ajJDaldPf-P8U,8577
+llm_ie-0.3.3.dist-info/METADATA,sha256=CeTsMNtWhEWCvOqHWSXu0KqOgDp3kMwN2WtBF4N-4zE,41266
+llm_ie-0.3.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+llm_ie-0.3.3.dist-info/RECORD,,