llm-ie 0.4.5__py3-none-any.whl → 0.4.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_ie/extractors.py
CHANGED
|
@@ -224,7 +224,8 @@ class FrameExtractor(Extractor):
|
|
|
224
224
|
|
|
225
225
|
|
|
226
226
|
def _find_entity_spans(self, text: str, entities: List[str], case_sensitive:bool=False,
|
|
227
|
-
fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8
|
|
227
|
+
fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
|
|
228
|
+
allow_overlap_entities:bool=False) -> List[Tuple[int]]:
|
|
228
229
|
"""
|
|
229
230
|
This function inputs a text and a list of entity text,
|
|
230
231
|
outputs a list of spans (2-tuple) for each entity.
|
|
@@ -245,6 +246,8 @@ class FrameExtractor(Extractor):
|
|
|
245
246
|
fuzzy_score_cutoff : float, Optional
|
|
246
247
|
the Jaccard score cutoff for fuzzy matching.
|
|
247
248
|
Matched entity text must have a score greater than or equal to this value; otherwise None is returned for that entity.
|
|
249
|
+
allow_overlap_entities : bool, Optional
|
|
250
|
+
if True, entities can overlap in the text.
|
|
248
251
|
"""
|
|
249
252
|
# Handle case sensitivity
|
|
250
253
|
if not case_sensitive:
|
|
@@ -264,15 +267,17 @@ class FrameExtractor(Extractor):
|
|
|
264
267
|
if match and entity:
|
|
265
268
|
start, end = match.span()
|
|
266
269
|
entity_spans.append((start, end))
|
|
267
|
-
|
|
268
|
-
|
|
270
|
+
if not allow_overlap_entities:
|
|
271
|
+
# Replace the found entity with spaces to avoid finding the same instance again
|
|
272
|
+
text = text[:start] + ' ' * (end - start) + text[end:]
|
|
269
273
|
# Fuzzy match
|
|
270
274
|
elif fuzzy_match:
|
|
271
275
|
closest_substring_span, best_score = self._get_closest_substring(text, entity, buffer_size=fuzzy_buffer_size)
|
|
272
276
|
if closest_substring_span and best_score >= fuzzy_score_cutoff:
|
|
273
277
|
entity_spans.append(closest_substring_span)
|
|
274
|
-
|
|
275
|
-
|
|
278
|
+
if not allow_overlap_entities:
|
|
279
|
+
# Replace the found entity with spaces to avoid finding the same instance again
|
|
280
|
+
text = text[:closest_substring_span[0]] + ' ' * (closest_substring_span[1] - closest_substring_span[0]) + text[closest_substring_span[1]:]
|
|
276
281
|
else:
|
|
277
282
|
entity_spans.append(None)
|
|
278
283
|
|
|
@@ -391,7 +396,7 @@ class BasicFrameExtractor(FrameExtractor):
|
|
|
391
396
|
def extract_frames(self, text_content:Union[str, Dict[str,str]], entity_key:str, max_new_tokens:int=2048,
|
|
392
397
|
temperature:float=0.0, document_key:str=None, stream:bool=False,
|
|
393
398
|
case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2,
|
|
394
|
-
fuzzy_score_cutoff:float=0.8, **kwrs) -> List[LLMInformationExtractionFrame]:
|
|
399
|
+
fuzzy_score_cutoff:float=0.8, allow_overlap_entities:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
|
|
395
400
|
"""
|
|
396
401
|
This method inputs a text and outputs a list of LLMInformationExtractionFrame
|
|
397
402
|
It uses the extract() method and post-processes outputs into frames.
|
|
@@ -422,6 +427,9 @@ class BasicFrameExtractor(FrameExtractor):
|
|
|
422
427
|
fuzzy_score_cutoff : float, Optional
|
|
423
428
|
the Jaccard score cutoff for fuzzy matching.
|
|
424
429
|
Matched entity text must have a score greater than or equal to this value; otherwise None is returned for that entity.
|
|
430
|
+
allow_overlap_entities : bool, Optional
|
|
431
|
+
if True, entities can overlap in the text.
|
|
432
|
+
Note that this can cause multiple frames to be generated on the same entity span if they have the same entity text.
|
|
425
433
|
|
|
426
434
|
Return : List[LLMInformationExtractionFrame]
|
|
427
435
|
a list of frames.
|
|
@@ -452,7 +460,8 @@ class BasicFrameExtractor(FrameExtractor):
|
|
|
452
460
|
case_sensitive=case_sensitive,
|
|
453
461
|
fuzzy_match=fuzzy_match,
|
|
454
462
|
fuzzy_buffer_size=fuzzy_buffer_size,
|
|
455
|
-
fuzzy_score_cutoff=fuzzy_score_cutoff
|
|
463
|
+
fuzzy_score_cutoff=fuzzy_score_cutoff,
|
|
464
|
+
allow_overlap_entities=allow_overlap_entities)
|
|
456
465
|
|
|
457
466
|
for i, (ent, span) in enumerate(zip(entity_json, spans)):
|
|
458
467
|
if span is not None:
|
|
@@ -761,8 +770,8 @@ class SentenceFrameExtractor(FrameExtractor):
|
|
|
761
770
|
sentences = self._get_sentences(text_content[document_key])
|
|
762
771
|
|
|
763
772
|
# generate sentence by sentence
|
|
764
|
-
tasks = []
|
|
765
773
|
for i in range(0, len(sentences), concurrent_batch_size):
|
|
774
|
+
tasks = []
|
|
766
775
|
batch = sentences[i:i + concurrent_batch_size]
|
|
767
776
|
for j, sent in enumerate(batch):
|
|
768
777
|
# construct chat messages
|
|
@@ -797,12 +806,12 @@ class SentenceFrameExtractor(FrameExtractor):
|
|
|
797
806
|
# Wait until the batch is done, collect results and move on to next batch
|
|
798
807
|
responses = await asyncio.gather(*tasks)
|
|
799
808
|
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
809
|
+
# Collect outputs
|
|
810
|
+
for gen_text, sent in zip(responses, batch):
|
|
811
|
+
output.append({'sentence_start': sent['start'],
|
|
812
|
+
'sentence_end': sent['end'],
|
|
813
|
+
'sentence_text': sent['sentence_text'],
|
|
814
|
+
'gen_text': gen_text})
|
|
806
815
|
return output
|
|
807
816
|
|
|
808
817
|
|
|
@@ -810,7 +819,7 @@ class SentenceFrameExtractor(FrameExtractor):
|
|
|
810
819
|
document_key:str=None, temperature:float=0.0, stream:bool=False,
|
|
811
820
|
concurrent:bool=False, concurrent_batch_size:int=32,
|
|
812
821
|
case_sensitive:bool=False, fuzzy_match:bool=True, fuzzy_buffer_size:float=0.2, fuzzy_score_cutoff:float=0.8,
|
|
813
|
-
**kwrs) -> List[LLMInformationExtractionFrame]:
|
|
822
|
+
allow_overlap_entities:bool=False, **kwrs) -> List[LLMInformationExtractionFrame]:
|
|
814
823
|
"""
|
|
815
824
|
This method inputs a text and outputs a list of LLMInformationExtractionFrame
|
|
816
825
|
It uses the extract() method and post-processes outputs into frames.
|
|
@@ -845,6 +854,9 @@ class SentenceFrameExtractor(FrameExtractor):
|
|
|
845
854
|
fuzzy_score_cutoff : float, Optional
|
|
846
855
|
the Jaccard score cutoff for fuzzy matching.
|
|
847
856
|
Matched entity text must have a score greater than or equal to this value; otherwise None is returned for that entity.
|
|
857
|
+
allow_overlap_entities : bool, Optional
|
|
858
|
+
if True, entities can overlap in the text.
|
|
859
|
+
Note that this can cause multiple frames to be generated on the same entity span if they have the same entity text.
|
|
848
860
|
|
|
849
861
|
Return : List[LLMInformationExtractionFrame]
|
|
850
862
|
a list of frames.
|
|
@@ -882,7 +894,8 @@ class SentenceFrameExtractor(FrameExtractor):
|
|
|
882
894
|
case_sensitive=case_sensitive,
|
|
883
895
|
fuzzy_match=fuzzy_match,
|
|
884
896
|
fuzzy_buffer_size=fuzzy_buffer_size,
|
|
885
|
-
fuzzy_score_cutoff=fuzzy_score_cutoff
|
|
897
|
+
fuzzy_score_cutoff=fuzzy_score_cutoff,
|
|
898
|
+
allow_overlap_entities=allow_overlap_entities)
|
|
886
899
|
for ent, span in zip(entity_json, spans):
|
|
887
900
|
if span is not None:
|
|
888
901
|
start, end = span
|
|
@@ -1083,9 +1096,10 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
|
|
|
1083
1096
|
sentences = self._get_sentences(text_content[document_key])
|
|
1084
1097
|
|
|
1085
1098
|
# generate initial outputs sentence by sentence
|
|
1086
|
-
tasks = []
|
|
1087
|
-
messages_list = []
|
|
1088
1099
|
for i in range(0, len(sentences), concurrent_batch_size):
|
|
1100
|
+
messages_list = []
|
|
1101
|
+
init_tasks = []
|
|
1102
|
+
review_tasks = []
|
|
1089
1103
|
batch = sentences[i:i + concurrent_batch_size]
|
|
1090
1104
|
for j, sent in enumerate(batch):
|
|
1091
1105
|
# construct chat messages
|
|
@@ -1116,24 +1130,21 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
|
|
|
1116
1130
|
**kwrs
|
|
1117
1131
|
)
|
|
1118
1132
|
)
|
|
1119
|
-
|
|
1133
|
+
init_tasks.append(task)
|
|
1120
1134
|
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1135
|
+
# Wait until the batch is done, collect results and move on to next batch
|
|
1136
|
+
init_responses = await asyncio.gather(*init_tasks)
|
|
1137
|
+
# Collect initials
|
|
1138
|
+
initials = []
|
|
1139
|
+
for gen_text, sent, messages in zip(init_responses, batch, messages_list):
|
|
1140
|
+
initials.append({'sentence_start': sent['start'],
|
|
1141
|
+
'sentence_end': sent['end'],
|
|
1142
|
+
'sentence_text': sent['sentence_text'],
|
|
1143
|
+
'gen_text': gen_text,
|
|
1144
|
+
'messages': messages})
|
|
1131
1145
|
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
for i in range(0, len(initials), concurrent_batch_size):
|
|
1135
|
-
batch = initials[i:i + concurrent_batch_size]
|
|
1136
|
-
for init in batch:
|
|
1146
|
+
# Review
|
|
1147
|
+
for init in initials:
|
|
1137
1148
|
messages = init["messages"]
|
|
1138
1149
|
initial = init["gen_text"]
|
|
1139
1150
|
messages.append({'role': 'assistant', 'content': initial})
|
|
@@ -1146,29 +1157,29 @@ class SentenceReviewFrameExtractor(SentenceFrameExtractor):
|
|
|
1146
1157
|
**kwrs
|
|
1147
1158
|
)
|
|
1148
1159
|
)
|
|
1149
|
-
|
|
1160
|
+
review_tasks.append(task)
|
|
1150
1161
|
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1162
|
+
review_responses = await asyncio.gather(*review_tasks)
|
|
1163
|
+
|
|
1164
|
+
# Collect reviews
|
|
1165
|
+
reviews = []
|
|
1166
|
+
for gen_text, sent in zip(review_responses, batch):
|
|
1167
|
+
reviews.append({'sentence_start': sent['start'],
|
|
1168
|
+
'sentence_end': sent['end'],
|
|
1169
|
+
'sentence_text': sent['sentence_text'],
|
|
1170
|
+
'gen_text': gen_text})
|
|
1171
|
+
|
|
1172
|
+
for init, rev in zip(initials, reviews):
|
|
1173
|
+
if self.review_mode == "revision":
|
|
1174
|
+
gen_text = rev['gen_text']
|
|
1175
|
+
elif self.review_mode == "addition":
|
|
1176
|
+
gen_text = init['gen_text'] + '\n' + rev['gen_text']
|
|
1177
|
+
|
|
1178
|
+
# add to output
|
|
1179
|
+
output.append({'sentence_start': init['sentence_start'],
|
|
1180
|
+
'sentence_end': init['sentence_end'],
|
|
1181
|
+
'sentence_text': init['sentence_text'],
|
|
1182
|
+
'gen_text': gen_text})
|
|
1172
1183
|
return output
|
|
1173
1184
|
|
|
1174
1185
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: llm-ie
|
|
3
|
-
Version: 0.4.5
|
|
3
|
+
Version: 0.4.6
|
|
4
4
|
Summary: An LLM-powered tool that transforms everyday language into robust information extraction pipelines.
|
|
5
5
|
License: MIT
|
|
6
6
|
Author: Enshuo (David) Hsu
|
|
@@ -44,7 +44,7 @@ An LLM-powered tool that transforms everyday language into robust information ex
|
|
|
44
44
|
- [v0.4.5](https://github.com/daviden1013/llm-ie/releases/tag/v0.4.5) (Feb 16, 2025):
|
|
45
45
|
- Added option to adjust number of context sentences in sentence-based extractors.
|
|
46
46
|
- Added support for OpenAI reasoning models ("o" series).
|
|
47
|
-
|
|
47
|
+
- [v0.4.6](https://github.com/daviden1013/llm-ie/releases/tag/v0.4.6) (Mar 1, 2025): Allow LLM to output overlapping frames.
|
|
48
48
|
|
|
49
49
|
## Table of Contents
|
|
50
50
|
- [Overview](#overview)
|
|
@@ -16,8 +16,8 @@ llm_ie/asset/prompt_guide/SentenceFrameExtractor_prompt_guide.txt,sha256=YomMhiA
|
|
|
16
16
|
llm_ie/asset/prompt_guide/SentenceReviewFrameExtractor_prompt_guide.txt,sha256=YomMhiA6BoBIJFOZ1jcoKRqY7_T3PGUsM0t0vd1IuZA,9325
|
|
17
17
|
llm_ie/data_types.py,sha256=_Kt4Er1SMj1jg8U8TCXFJH_64prur-IbFngHKmZgWr8,15717
|
|
18
18
|
llm_ie/engines.py,sha256=nWQzV7mcRCOkJ-U1iP-xrT9dVahVbj-nAjQci4XaRjY,22609
|
|
19
|
-
llm_ie/extractors.py,sha256=
|
|
19
|
+
llm_ie/extractors.py,sha256=xdg1aYN_My9N9HrzKuMniqFCzNCq7E3DKI7Ru9gawFs,89822
|
|
20
20
|
llm_ie/prompt_editor.py,sha256=pHRbg_yFZdoV63r3pvf0TsLfgH2EVJvzUQEVDH1Hj0s,9570
|
|
21
|
-
llm_ie-0.4.
|
|
22
|
-
llm_ie-0.4.
|
|
23
|
-
llm_ie-0.4.
|
|
21
|
+
llm_ie-0.4.6.dist-info/METADATA,sha256=wYKoOc0fvKvEAOvHGnMGOGgaKP49rmDUnHhA8vKBrgg,55745
|
|
22
|
+
llm_ie-0.4.6.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
23
|
+
llm_ie-0.4.6.dist-info/RECORD,,
|
|
File without changes
|