llm-ie 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl
llm_ie/extractors.py
CHANGED
@@ -5,10 +5,11 @@ import inspect
 import importlib.resources
 import warnings
 import itertools
-from typing import List, Dict, Tuple, Union, Callable
+from typing import Set, List, Dict, Tuple, Union, Callable
 from llm_ie.data_types import LLMInformationExtractionFrame, LLMInformationExtractionDocument
 from llm_ie.engines import InferenceEngine
 from colorama import Fore, Style
+from nltk.tokenize import RegexpTokenizer
 
 
 class Extractor:
@@ -37,7 +38,7 @@ class Extractor:
         This method returns the pre-defined prompt guideline for the extractor from the package asset.
         """
         file_path = importlib.resources.files('llm_ie.asset.prompt_guide').joinpath(f"{cls.__name__}_prompt_guide.txt")
-        with open(file_path, 'r') as f:
+        with open(file_path, 'r', encoding="utf-8") as f:
             return f.read()
 
 
@@ -139,9 +140,68 @@ class FrameExtractor(Extractor):
                          prompt_template=prompt_template,
                          system_prompt=system_prompt,
                          **kwrs)
-
+        self.tokenizer = RegexpTokenizer(r'\w+|[^\w\s]')
+
+
+    def _jaccard_score(self, s1:set, s2:set) -> float:
+        """
+        This method calculates the Jaccard score between two sets of word tokens.
+        """
+        return len(s1.intersection(s2)) / len(s1.union(s2))
+
+
+    def _get_word_tokens(self, text) -> Tuple[List[str], List[Tuple[int]]]:
+        """
+        This method tokenizes the input text into a list of word tokens and their spans.
+        """
+        tokens = []
+        spans = []
+        for span in self.tokenizer.span_tokenize(text):
+            spans.append(span)
+            start, end = span
+            tokens.append(text[start:end])
+        return tokens, spans
+
+
+    def _get_closest_substring(self, text:str, pattern:str, buffer_size:float=0.2) -> Tuple[Tuple[int, int], float]:
+        """
+        This method finds the closest (highest Jaccard score) substring in text that matches the pattern.
+
+        Parameters
+        ----------
+        text : str
+            the input text.
+        pattern : str
+            the pattern to match.
+        buffer_size : float, Optional
+            the buffer size for the matching window. Default is 20% of pattern length.
+
+        Returns : Tuple[Tuple[int, int], float]
+            a tuple of 2-tuple span and Jaccard score.
+        """
+        text_tokens, text_spans = self._get_word_tokens(text)
+        pattern_tokens, _ = self._get_word_tokens(pattern)
+        pattern_tokens = set(pattern_tokens)
+        window_size = len(pattern_tokens)
+        window_size_min = int(window_size * (1 - buffer_size))
+        window_size_max = int(window_size * (1 + buffer_size))
+        closest_substring_spans = None
+        best_score = 0
+
+        for i in range(len(text_tokens) - window_size_max):
+            for w in range(window_size_min, window_size_max):
+                sub_str_tokens = set(text_tokens[i:i + w])
+                score = self._jaccard_score(sub_str_tokens, pattern_tokens)
+                if score > best_score:
+                    best_score = score
+                    sub_string_word_spans = text_spans[i:i + w]
+                    closest_substring_spans = (sub_string_word_spans[0][0], sub_string_word_spans[-1][-1])
 
-
+        return closest_substring_spans, best_score
+
+
+    def _find_entity_spans(self, text: str, entities: List[str], case_sensitive:bool=False,
+                           fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8) -> List[Tuple[int]]:
         """
         This function inputs a text and a list of entity text,
         outputs a list of spans (2-tuple) for each entity.
@@ -151,19 +211,46 @@ class FrameExtractor(Extractor):
         ----------
         text : str
             text that contains entities
+        entities : List[str]
+            a list of entity text to find in the text
+        case_sensitive : bool, Optional
+            if True, entity text matching will be case-sensitive.
+        fuzzy_match : bool, Optional
+            if True, fuzzy matching will be applied to find entity text.
+        fuzzy_buffer_size : float, Optional
+            the buffer size for fuzzy matching. Default is 20% of entity text length.
+        fuzzy_score_cutoff : float, Optional
+            the Jaccard score cutoff for fuzzy matching.
+            Matched entity text must have a score higher than this value or a None will be returned.
         """
+        # Handle case sensitivity
+        if not case_sensitive:
+            text = text.lower()
+
+        # Match entities
         entity_spans = []
-        for entity in entities:
-            if case_sensitive:
-
-
-
-
+        for entity in entities:
+            if not case_sensitive:
+                entity = entity.lower()
+
+            # Exact match
+            match = re.search(re.escape(entity), text)
             if match:
                 start, end = match.span()
                 entity_spans.append((start, end))
                 # Replace the found entity with spaces to avoid finding the same instance again
                 text = text[:start] + ' ' * (end - start) + text[end:]
+            # Fuzzy match
+            elif fuzzy_match:
+                closest_substring_span, best_score = self._get_closest_substring(text, entity, buffer_size=fuzzy_buffer_size)
+                if best_score >= fuzzy_score_cutoff:
+                    entity_spans.append(closest_substring_span)
+                    # Replace the found entity with spaces to avoid finding the same instance again
+                    text = text[:closest_substring_span[0]] + ' ' * (closest_substring_span[1] - closest_substring_span[0]) + text[closest_substring_span[1]:]
+                else:
+                    entity_spans.append(None)
+
+            # No match
             else:
                 entity_spans.append(None)
 
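Note: the fuzzy matching added above is a sliding-window Jaccard comparison over word tokens. Below is a minimal standalone sketch of the same idea, using plain `re` instead of nltk's RegexpTokenizer; the example strings are invented for illustration and are not from the package.

import re
from typing import Optional, Tuple

TOKEN_PATTERN = r'\w+|[^\w\s]'   # same token pattern the diff passes to RegexpTokenizer

def jaccard(s1: set, s2: set) -> float:
    # Shared tokens over all tokens appearing in either set
    return len(s1 & s2) / len(s1 | s2)

def closest_substring(text: str, pattern: str, buffer_size: float = 0.2) -> Tuple[Optional[Tuple[int, int]], float]:
    # Tokenize the text (keeping character spans) and the pattern
    text_matches = list(re.finditer(TOKEN_PATTERN, text))
    text_tokens = [m.group() for m in text_matches]
    pattern_tokens = set(re.findall(TOKEN_PATTERN, pattern))
    # Candidate window lengths: within +/- buffer_size of the pattern token count,
    # upper bound exclusive, mirroring the ranges in the diff above
    w_min = int(len(pattern_tokens) * (1 - buffer_size))
    w_max = int(len(pattern_tokens) * (1 + buffer_size))
    best_span, best_score = None, 0.0
    for i in range(len(text_tokens) - w_max):
        for w in range(w_min, w_max):
            score = jaccard(set(text_tokens[i:i + w]), pattern_tokens)
            if score > best_score:
                best_score = score
                best_span = (text_matches[i].start(), text_matches[i + w - 1].end())
    return best_span, best_score

# Made-up example: the best window covers "chronic kidney" with score 2/3,
# which would fall below the 0.8 default cutoff used upstream
span, score = closest_substring("Patient with chronic kidney dz stage 3.", "chronic kidney disease")
print(span, round(score, 2))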
@@ -276,7 +363,9 @@ class BasicFrameExtractor(FrameExtractor):
 
 
     def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=2048,
-                        temperature:float=0.0,
+                        temperature:float=0.0, document_key:str=None, stream:bool=False,
+                        case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2,
+                        fuzzy_score_cutoff:float=0.8, **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs a text and outputs a list of LLMInformationExtractionFrame
         It use the extract() method and post-process outputs into frames.
@@ -293,18 +382,30 @@ class BasicFrameExtractor(FrameExtractor):
             the max number of new tokens LLM should generate.
         temperature : float, Optional
             the temperature for token sampling.
-        case_sensitive : bool, Optional
-            if True, entity text matching will be case-sensitive.
         document_key : str, Optional
             specify the key in text_content where document text is.
             If text_content is str, this parameter will be ignored.
+        stream : bool, Optional
+            if True, LLM generated text will be printed in terminal in real-time.
+        case_sensitive : bool, Optional
+            if True, entity text matching will be case-sensitive.
+        fuzzy_match : bool, Optional
+            if True, fuzzy matching will be applied to find entity text.
+        fuzzy_buffer_size : float, Optional
+            the buffer size for fuzzy matching. Default is 20% of entity text length.
+        fuzzy_score_cutoff : float, Optional
+            the Jaccard score cutoff for fuzzy matching.
+            Matched entity text must have a score higher than this value or a None will be returned.
 
         Return : str
             a list of frames.
         """
         frame_list = []
         gen_text = self.extract(text_content=text_content,
-                                max_new_tokens=max_new_tokens,
+                                max_new_tokens=max_new_tokens, 
+                                temperature=temperature, 
+                                stream=stream,
+                                **kwrs)
 
         entity_json = []
         for entity in self._extract_json(gen_text=gen_text):
@@ -320,7 +421,10 @@ class BasicFrameExtractor(FrameExtractor):
 
         spans = self._find_entity_spans(text=text,
                                         entities=[e[entity_key] for e in entity_json],
-                                        case_sensitive=case_sensitive
+                                        case_sensitive=case_sensitive,
+                                        fuzzy_match=fuzzy_match,
+                                        fuzzy_buffer_size=fuzzy_buffer_size,
+                                        fuzzy_score_cutoff=fuzzy_score_cutoff)
 
         for i, (ent, span) in enumerate(zip(entity_json, spans)):
             if span is not None:
@@ -370,7 +474,7 @@ class ReviewFrameExtractor(BasicFrameExtractor):
         else:
             file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
                 joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
-            with open(file_path, 'r') as f:
+            with open(file_path, 'r', encoding="utf-8") as f:
                 self.review_prompt = f.read()
 
         warnings.warn(f'Custom review prompt not provided. The default review prompt is used:\n"{self.review_prompt}"', UserWarning)
@@ -559,8 +663,9 @@ class SentenceFrameExtractor(FrameExtractor):
 
 
     def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=512,
-                        document_key:str=None, multi_turn:bool=False, temperature:float=0.0,
-
+                        document_key:str=None, multi_turn:bool=False, temperature:float=0.0, stream:bool=False,
+                        case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
+                        **kwrs) -> List[LLMInformationExtractionFrame]:
         """
         This method inputs a text and outputs a list of LLMInformationExtractionFrame
         It use the extract() method and post-process outputs into frames.
@@ -586,10 +691,17 @@ class SentenceFrameExtractor(FrameExtractor):
             can better utilize the KV caching.
         temperature : float, Optional
             the temperature for token sampling.
-        case_sensitive : bool, Optional
-            if True, entity text matching will be case-sensitive.
         stream : bool, Optional
             if True, LLM generated text will be printed in terminal in real-time.
+        case_sensitive : bool, Optional
+            if True, entity text matching will be case-sensitive.
+        fuzzy_match : bool, Optional
+            if True, fuzzy matching will be applied to find entity text.
+        fuzzy_buffer_size : float, Optional
+            the buffer size for fuzzy matching. Default is 20% of entity text length.
+        fuzzy_score_cutoff : float, Optional
+            the Jaccard score cutoff for fuzzy matching.
+            Matched entity text must have a score higher than this value or a None will be returned.
 
         Return : str
             a list of frames.
@@ -611,7 +723,11 @@
                     warnings.warn(f'Extractor output "{entity}" does not have entity_key ("{entity_key}"). This frame will be dropped.', RuntimeWarning)
 
             spans = self._find_entity_spans(text=sent['sentence_text'],
-                                            entities=[e[entity_key] for e in entity_json],
+                                            entities=[e[entity_key] for e in entity_json], 
+                                            case_sensitive=case_sensitive,
+                                            fuzzy_match=fuzzy_match,
+                                            fuzzy_buffer_size=fuzzy_buffer_size,
+                                            fuzzy_score_cutoff=fuzzy_score_cutoff)
             for ent, span in zip(entity_json, spans):
                 if span is not None:
                     start, end = span
@@ -663,7 +779,7 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
         else:
             file_path = importlib.resources.files('llm_ie.asset.default_prompts').\
                 joinpath(f"{self.__class__.__name__}_{self.review_mode}_review_prompt.txt")
-            with open(file_path, 'r') as f:
+            with open(file_path, 'r', encoding="utf-8") as f:
                 self.review_prompt = f.read()
 
         warnings.warn(f'Custom review prompt not provided. The default review prompt is used:\n"{self.review_prompt}"', UserWarning)
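Taken together, the extractors.py changes surface new keyword arguments on the extract_frames methods. A hypothetical call using them follows; this is a sketch only: the engine class, constructor kwargs, prompt template, entity key, and input text are placeholders, while the extract_frames keyword arguments and their defaults come from the diff above.

from llm_ie.engines import OllamaInferenceEngine        # assumption: substitute the engine you actually use
from llm_ie.extractors import BasicFrameExtractor

engine = OllamaInferenceEngine(model_name="llama3.1:8b") # assumption: constructor kwargs vary by engine
extractor = BasicFrameExtractor(engine, prompt_template="...your prompt template...")

note_text = "Patient with chronic kidney dz stage 3."    # placeholder document

frames = extractor.extract_frames(
    text_content=note_text,
    entity_key="entity_text",        # assumption: the key your prompt asks the LLM to output
    stream=True,                     # new in signature: print LLM output in real time
    case_sensitive=False,            # lowercase text and entities before span matching
    fuzzy_match=True,                # new: fall back to Jaccard matching when exact search fails
    fuzzy_buffer_size=0.2,           # new: window-size tolerance (+/- 20% of entity token count)
    fuzzy_score_cutoff=0.8,          # new: fuzzy matches below this Jaccard score yield no span
)
print(len(frames), "frames extracted")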
llm_ie/prompt_editor.py
CHANGED
@@ -5,8 +5,6 @@ from llm_ie.engines import InferenceEngine
 from llm_ie.extractors import FrameExtractor
 import re
 from colorama import Fore, Style
-import ipywidgets as widgets
-from IPython.display import display, HTML
 
 
 class PromptEditor:
@@ -121,6 +119,16 @@ class PromptEditor:
         """
         This method runs an interactive chat session in Jupyter/IPython using ipywidgets to help users write prompt templates.
         """
+        # Check if ipywidgets is installed
+        if importlib.util.find_spec("ipywidgets") is None:
+            raise ImportError("ipywidgets not found. Please install ipywidgets (```pip install ipywidgets```).")
+        import ipywidgets as widgets
+
+        # Check if IPython is installed
+        if importlib.util.find_spec("IPython") is None:
+            raise ImportError("IPython not found. Please install IPython (```pip install ipython```).")
+        from IPython.display import display, HTML
+
         # Load the chat prompt template from the resources
         file_path = importlib.resources.files('llm_ie.asset.PromptEditor_prompts').joinpath('chat.txt')
         with open(file_path, 'r') as f:
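The prompt_editor.py change moves ipywidgets and IPython from hard top-level imports to lazy, guarded imports inside the interactive method. The guard follows the usual optional-dependency pattern; here is a generic standalone sketch, not llm-ie code, and the helper name is made up.

import importlib.util

def require_optional(module_name: str, pip_name: str) -> None:
    # Fail early with an actionable message instead of a bare ModuleNotFoundError
    if importlib.util.find_spec(module_name) is None:
        raise ImportError(f"{module_name} not found. Please install it (pip install {pip_name}).")

def launch_notebook_ui() -> None:
    # Heavy UI dependencies are imported only when the interactive feature is used
    require_optional("ipywidgets", "ipywidgets")
    require_optional("IPython", "ipython")
    import ipywidgets as widgets
    from IPython.display import display, HTML
    display(HTML("<b>ready</b>"))
    display(widgets.Text(description="Prompt:"))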
llm_ie-0.3.1.dist-info/RECORD → llm_ie-0.3.2.dist-info/RECORD
RENAMED
@@ -16,8 +16,8 @@ llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt,sha256=oKH_QeD
 llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt,sha256=oKH_QeDgpw771ZdHk3L7DYz2Jvfm7OolUoTiJyMJI30,9541
 llm_ie/data_types.py,sha256=hPz3WOeAzfn2QKmb0CxHmRdQWZQ4G9zq8U-RJBVFdYk,14329
 llm_ie/engines.py,sha256=PTYs7s_iCPmI-yFUCVCPY_cMGS77ma2VGoz4rdNkODI,9308
-llm_ie/extractors.py,sha256=
-llm_ie/prompt_editor.py,sha256=
-llm_ie-0.3.
-llm_ie-0.3.
-llm_ie-0.3.
+llm_ie/extractors.py,sha256=bZQh_qZP1yIwNLXIx1ZzVGN702q3qzrlwiTcHuMsJt0,64051
+llm_ie/prompt_editor.py,sha256=Xc5ZHsEnM8-YYITokIsM6BVsf2Ec_8ajJDaldPf-P8U,8577
+llm_ie-0.3.2.dist-info/METADATA,sha256=Lpzaq0n-kpfiIpzic3qUQncVDoCpfw6CGmxc5NftXro,41266
+llm_ie-0.3.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+llm_ie-0.3.2.dist-info/RECORD,,