pysbd-plus 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- benchmarks/__init__.py +0 -0
- benchmarks/benchmark_sbd_tools.py +84 -0
- benchmarks/bigtext_speed_benchmark.py +75 -0
- benchmarks/english_golden_rules.py +210 -0
- benchmarks/genia_benchmark.py +100 -0
- benchmarks/indonesian_golden_rules.py +247 -0
- benchmarks/latency_en_vs_id.py +70 -0
- pysbd/__init__.py +2 -0
- pysbd/abbreviation_replacer.py +112 -0
- pysbd/about.py +10 -0
- pysbd/between_punctuation.py +94 -0
- pysbd/clean/__init__.py +0 -0
- pysbd/clean/rules.py +80 -0
- pysbd/cleaner.py +111 -0
- pysbd/exclamation_words.py +17 -0
- pysbd/lang/__init__.py +0 -0
- pysbd/lang/amharic.py +13 -0
- pysbd/lang/arabic.py +35 -0
- pysbd/lang/armenian.py +13 -0
- pysbd/lang/bulgarian.py +24 -0
- pysbd/lang/burmese.py +13 -0
- pysbd/lang/chinese.py +36 -0
- pysbd/lang/common/__init__.py +2 -0
- pysbd/lang/common/common.py +91 -0
- pysbd/lang/common/standard.py +113 -0
- pysbd/lang/danish.py +40 -0
- pysbd/lang/deutsch.py +97 -0
- pysbd/lang/dutch.py +12 -0
- pysbd/lang/english.py +11 -0
- pysbd/lang/french.py +15 -0
- pysbd/lang/greek.py +13 -0
- pysbd/lang/hindi.py +13 -0
- pysbd/lang/indonesian.py +60 -0
- pysbd/lang/italian.py +15 -0
- pysbd/lang/japanese.py +51 -0
- pysbd/lang/kazakh.py +50 -0
- pysbd/lang/marathi.py +14 -0
- pysbd/lang/persian.py +30 -0
- pysbd/lang/polish.py +15 -0
- pysbd/lang/russian.py +27 -0
- pysbd/lang/slovak.py +111 -0
- pysbd/lang/spanish.py +15 -0
- pysbd/lang/urdu.py +13 -0
- pysbd/languages.py +66 -0
- pysbd/lists_item_replacer.py +240 -0
- pysbd/processor.py +204 -0
- pysbd/punctuation_replacer.py +44 -0
- pysbd/segmenter.py +96 -0
- pysbd/utils.py +81 -0
- pysbd_plus-0.3.5.dist-info/METADATA +139 -0
- pysbd_plus-0.3.5.dist-info/RECORD +55 -0
- pysbd_plus-0.3.5.dist-info/WHEEL +5 -0
- pysbd_plus-0.3.5.dist-info/entry_points.txt +2 -0
- pysbd_plus-0.3.5.dist-info/licenses/LICENSE +21 -0
- pysbd_plus-0.3.5.dist-info/top_level.txt +2 -0
benchmarks/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import blingfire
|
|
2
|
+
import nltk
|
|
3
|
+
import pysbd
|
|
4
|
+
import spacy
|
|
5
|
+
import stanza
|
|
6
|
+
|
|
7
|
+
from syntok.tokenizer import Tokenizer
|
|
8
|
+
import syntok.segmenter as syntok_segmenter
|
|
9
|
+
|
|
10
|
+
from english_golden_rules import GOLDEN_EN_RULES
|
|
11
|
+
|
|
12
|
+
# pySBD segmenter under test: English, with clean and char_span turned off.
pysbd_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False)

# Blank English spaCy pipeline carrying only the "sentencizer" component.
nlp = spacy.blank('en')
try:
    # spaCy 2.x: add_pipe() expects the component object.
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
except ValueError:
    # spaCy 3.x rejects component objects and expects the factory name.
    nlp.add_pipe("sentencizer")
# Pretrained 'en_core_web_sm' pipeline with the "ner" component disabled.
nlp_dep = spacy.load('en_core_web_sm', disable=["ner"])
# Model download is left disabled here.
#stanza.download('en')
# Stanza pipeline restricted to the 'tokenize' processor.
stanza_nlp = stanza.Pipeline(lang='en', processors='tokenize')

# Shared syntok tokenizer used by syntok_tokenize().
syntok_tokenizer = Tokenizer()
|
|
21
|
+
|
|
22
|
+
def blingfire_tokenize(text):
    """Segment *text* with BlingFire and return the pieces as a list.

    The string returned by ``blingfire.text_to_sentences`` is split on
    newline characters.
    """
    joined_sentences = blingfire.text_to_sentences(text)
    return joined_sentences.split('\n')
|
|
24
|
+
|
|
25
|
+
def nltk_tokenize(text):
    """Return whatever ``nltk.sent_tokenize`` produces for *text*."""
    sentences = nltk.sent_tokenize(text)
    return sentences
|
|
27
|
+
|
|
28
|
+
def pysbd_tokenize(text):
    """Segment *text* with the module-level pySBD segmenter.

    Each segment is stripped of surrounding whitespace before being returned.
    """
    stripped = []
    for segment in pysbd_segmenter.segment(text):
        stripped.append(segment.strip())
    return stripped
|
|
31
|
+
|
|
32
|
+
def spacy_tokenize(text):
    """Return the text of every sentence the module-level ``nlp`` pipeline yields."""
    doc = nlp(text)
    return [span.text for span in doc.sents]
|
|
34
|
+
|
|
35
|
+
def spacy_dep_tokenize(text):
    """Return the text of every sentence the module-level ``nlp_dep`` pipeline yields."""
    doc = nlp_dep(text)
    return [span.text for span in doc.sents]
|
|
37
|
+
|
|
38
|
+
def stanza_tokenize(text):
    """Run the module-level stanza pipeline on *text* and return each sentence's text."""
    document = stanza_nlp(text)
    return [sentence.text for sentence in document.sentences]
|
|
40
|
+
|
|
41
|
+
def make_sentences(segmented_tokens):
    """Yield one string per sentence in *segmented_tokens*.

    Each sentence is an iterable of tokens; the tokens are converted with
    ``str``, concatenated without a separator, and the result is stripped.
    """
    for tokens in segmented_tokens:
        pieces = map(str, tokens)
        yield "".join(pieces).strip()
|
|
44
|
+
|
|
45
|
+
def syntok_tokenize(text):
    """Segment *text* with syntok and return the sentences as a list of strings.

    The text is tokenized with the module-level tokenizer, the token iterator
    is handed to ``syntok_segmenter.split``, and ``make_sentences`` turns each
    resulting sentence into a string.
    """
    token_stream = iter(syntok_tokenizer.split(text))
    sentence_groups = syntok_segmenter.split(token_stream)
    return list(make_sentences(sentence_groups))
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
total_rules = len(GOLDEN_EN_RULES)
|
|
53
|
+
|
|
54
|
+
def benchmark(golden_rules, tokenize_func):
    """Return the percentage of golden rules that *tokenize_func* gets exactly right.

    Args:
        golden_rules: iterable of ``(text, expected_segments)`` pairs.
        tokenize_func: callable taking the text and returning its segments.

    Returns:
        A float between 0.0 and 100.0; 0.0 when there are no rules.
    """
    # Materialize once so the denominator always matches the rules actually
    # evaluated, rather than relying on a module-level count.
    rules = list(golden_rules)
    if not rules:
        return 0.0
    score = sum(1 for text, expected in rules if tokenize_func(text) == expected)
    return (score / len(rules)) * 100.0
|
|
64
|
+
|
|
65
|
+
if __name__ == "__main__":
    import time

    # Number of full passes over the golden rules per library. The label
    # printed below hard-codes "100 runs", so keep the two in sync.
    runs = 100
    libraries = (
        blingfire_tokenize,
        nltk_tokenize,
        pysbd_tokenize,
        spacy_tokenize,
        spacy_dep_tokenize,
        stanza_tokenize,
        syntok_tokenize)
    for tokenize_func in libraries:
        # perf_counter() is monotonic and high resolution, unlike time.time().
        start = time.perf_counter()
        for _ in range(runs):
            percent_score = benchmark(GOLDEN_EN_RULES, tokenize_func)

        time_taken = time.perf_counter() - start
        print()
        print(tokenize_func.__name__)
        print('GRS score: {:0.2f}%'.format(percent_score))
        print('Speed(Avg over 100 runs): {:>10.2f} ms'.format(time_taken*1000/runs))
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import blingfire
|
|
2
|
+
import nltk
|
|
3
|
+
import pysbd
|
|
4
|
+
import spacy
|
|
5
|
+
import stanza
|
|
6
|
+
|
|
7
|
+
from syntok.tokenizer import Tokenizer
|
|
8
|
+
import syntok.segmenter as syntok_segmenter
|
|
9
|
+
|
|
10
|
+
# pySBD segmenter under test: English, with clean and char_span turned off.
pysbd_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False)

# Blank English spaCy pipeline carrying only the "sentencizer" component.
nlp = spacy.blank('en')
try:
    # spaCy 2.x: add_pipe() expects the component object.
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
except ValueError:
    # spaCy 3.x rejects component objects and expects the factory name.
    nlp.add_pipe("sentencizer")
# Pretrained 'en_core_web_sm' pipeline with the "ner" component disabled.
nlp_dep = spacy.load('en_core_web_sm', disable=["ner"])
# Model download is left disabled here.
#stanza.download('en')
# Stanza pipeline restricted to the 'tokenize' processor.
stanza_nlp = stanza.Pipeline(lang='en', processors='tokenize')

# Shared syntok tokenizer used by syntok_tokenize().
syntok_tokenizer = Tokenizer()
|
|
19
|
+
|
|
20
|
+
def blingfire_tokenize(text):
    """Split *text* into sentences via BlingFire.

    ``blingfire.text_to_sentences`` is called once and its result is split on
    newline characters to form the returned list.
    """
    raw = blingfire.text_to_sentences(text)
    return raw.split('\n')
|
|
22
|
+
|
|
23
|
+
def nltk_tokenize(text):
    """Delegate sentence splitting of *text* to ``nltk.sent_tokenize``."""
    result = nltk.sent_tokenize(text)
    return result
|
|
25
|
+
|
|
26
|
+
def pysbd_tokenize(text):
    """Return the pySBD segments of *text*, each stripped of surrounding whitespace."""
    return [piece.strip() for piece in pysbd_segmenter.segment(text)]
|
|
30
|
+
|
|
31
|
+
def spacy_tokenize(text):
    """Segment *text* with the module-level ``nlp`` pipeline.

    Leading and trailing newline characters are stripped from each sentence.
    """
    doc = nlp(text)
    sentences = []
    for span in doc.sents:
        sentences.append(span.text.strip("\n"))
    return sentences
|
|
33
|
+
|
|
34
|
+
def spacy_dep_tokenize(text):
    """Segment *text* with the module-level ``nlp_dep`` pipeline.

    Leading and trailing newline characters are stripped from each sentence.
    """
    doc = nlp_dep(text)
    sentences = []
    for span in doc.sents:
        sentences.append(span.text.strip("\n"))
    return sentences
|
|
36
|
+
|
|
37
|
+
def stanza_tokenize(text):
    """Return the ``text`` of each sentence stanza finds in *text*."""
    parsed = stanza_nlp(text)
    return [item.text for item in parsed.sentences]
|
|
39
|
+
|
|
40
|
+
def make_sentences(segmented_tokens):
    """Lazily turn groups of tokens into sentence strings.

    For each group, every token is passed through ``str``, the pieces are
    concatenated with no separator, and surrounding whitespace is stripped.
    """
    for group in segmented_tokens:
        rendered = [str(tok) for tok in group]
        yield "".join(rendered).strip()
|
|
43
|
+
|
|
44
|
+
def syntok_tokenize(text):
    """Return syntok's sentences for *text* as a list of strings.

    Tokens come from the module-level tokenizer; ``syntok_segmenter.split``
    groups them and ``make_sentences`` joins each group into a string.
    """
    token_iter = iter(syntok_tokenizer.split(text))
    grouped = syntok_segmenter.split(token_iter)
    return list(make_sentences(grouped))
|
|
49
|
+
|
|
50
|
+
def speed_benchmark(big_text, tokenize_func):
    """Apply *tokenize_func* to *big_text* and hand back its result unchanged."""
    return tokenize_func(big_text)
|
|
53
|
+
|
|
54
|
+
if __name__ == "__main__":
    import time
    libraries = (
        blingfire_tokenize,
        nltk_tokenize,
        pysbd_tokenize,
        spacy_tokenize,
        spacy_dep_tokenize,
        stanza_tokenize,
        syntok_tokenize)

    # Read the corpus once, outside the timed region, so the reported speed
    # measures segmentation only and not repeated disk I/O.
    # wget http://www.gutenberg.org/files/1661/1661-0.txt -P benchmarks/
    with open('benchmarks/1661-0.txt', encoding='utf-8') as bigfile:
        big_text = bigfile.read()

    for tokenize_func in libraries:
        # perf_counter() is monotonic and high resolution, unlike time.time().
        start = time.perf_counter()
        speed_benchmark(big_text, tokenize_func)
        time_taken = time.perf_counter() - start

        print()
        print(tokenize_func.__name__)
        print('Speed : {:>20.2f} ms'.format(time_taken * 1000))
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
GOLDEN_EN_RULES = [
|
|
4
|
+
# 1) Simple period to end sentence
|
|
5
|
+
("Hello World. My name is Jonas.", ["Hello World.", "My name is Jonas."]),
|
|
6
|
+
# 2) Question mark to end sentence
|
|
7
|
+
("What is your name? My name is Jonas.", ["What is your name?", "My name is Jonas."]),
|
|
8
|
+
# 3) Exclamation point to end sentence
|
|
9
|
+
("There it is! I found it.", ["There it is!", "I found it."]),
|
|
10
|
+
# 4) One letter upper case abbreviations
|
|
11
|
+
("My name is Jonas E. Smith.", ["My name is Jonas E. Smith."]),
|
|
12
|
+
# 5) One letter lower case abbreviations
|
|
13
|
+
("Please turn to p. 55.", ["Please turn to p. 55."]),
|
|
14
|
+
# 6) Two letter lower case abbreviations in the middle of a sentence
|
|
15
|
+
("Were Jane and co. at the party?", ["Were Jane and co. at the party?"]),
|
|
16
|
+
# 7) Two letter upper case abbreviations in the middle of a sentence
|
|
17
|
+
("They closed the deal with Pitt, Briggs & Co. at noon.",
|
|
18
|
+
["They closed the deal with Pitt, Briggs & Co. at noon."]),
|
|
19
|
+
# 8) Two letter lower case abbreviations at the end of a sentence
|
|
20
|
+
(
|
|
21
|
+
"Let's ask Jane and co. They should know.",
|
|
22
|
+
["Let's ask Jane and co.", "They should know."]),
|
|
23
|
+
# 9) Two letter upper case abbreviations at the end of a sentence
|
|
24
|
+
(
|
|
25
|
+
"They closed the deal with Pitt, Briggs & Co. It closed yesterday.", [
|
|
26
|
+
"They closed the deal with Pitt, Briggs & Co.",
|
|
27
|
+
"It closed yesterday."
|
|
28
|
+
],
|
|
29
|
+
),
|
|
30
|
+
# 10) Two letter (prepositive) abbreviations
|
|
31
|
+
("I can see Mt. Fuji from here.", ["I can see Mt. Fuji from here."]),
|
|
32
|
+
# 11) Two letter (prepositive & postpositive) abbreviations
|
|
33
|
+
(
|
|
34
|
+
"St. Michael's Church is on 5th st. near the light.",
|
|
35
|
+
["St. Michael's Church is on 5th st. near the light."],
|
|
36
|
+
),
|
|
37
|
+
# 12) Possessive two letter abbreviations
|
|
38
|
+
("That is JFK Jr.'s book.", ["That is JFK Jr.'s book."]),
|
|
39
|
+
# 13) Multi-period abbreviations in the middle of a sentence
|
|
40
|
+
("I visited the U.S.A. last year.", ["I visited the U.S.A. last year."]),
|
|
41
|
+
# 14) Multi-period abbreviations at the end of a sentence
|
|
42
|
+
(
|
|
43
|
+
"I live in the E.U. How about you?",
|
|
44
|
+
["I live in the E.U.", "How about you?"],
|
|
45
|
+
),
|
|
46
|
+
# 15) U.S. as sentence boundary
|
|
47
|
+
(
|
|
48
|
+
"I live in the U.S. How about you?",
|
|
49
|
+
["I live in the U.S.", "How about you?"],
|
|
50
|
+
),
|
|
51
|
+
# 16) U.S. as non sentence boundary with next word capitalized
|
|
52
|
+
("I work for the U.S. Government in Virginia.",
|
|
53
|
+
["I work for the U.S. Government in Virginia."]),
|
|
54
|
+
# 17) U.S. as non sentence boundary
|
|
55
|
+
("I have lived in the U.S. for 20 years.",
|
|
56
|
+
["I have lived in the U.S. for 20 years."]),
|
|
57
|
+
# Most difficult sentence to crack
|
|
58
|
+
# 18) A.M. / P.M. as non sentence boundary and sentence boundary
|
|
59
|
+
(
|
|
60
|
+
"At 5 a.m. Mr. Smith went to the bank. He left the bank at 6 P.M. Mr. Smith then went to the store.",
|
|
61
|
+
[
|
|
62
|
+
"At 5 a.m. Mr. Smith went to the bank.",
|
|
63
|
+
"He left the bank at 6 P.M.", "Mr. Smith then went to the store."
|
|
64
|
+
]
|
|
65
|
+
),
|
|
66
|
+
# 19) Number as non sentence boundary
|
|
67
|
+
("She has $100.00 in her bag.", ["She has $100.00 in her bag."]),
|
|
68
|
+
# 20) Number as sentence boundary
|
|
69
|
+
("She has $100.00. It is in her bag.", ["She has $100.00.", "It is in her bag."]),
|
|
70
|
+
# 21) Parenthetical inside sentence
|
|
71
|
+
("He teaches science (He previously worked for 5 years as an engineer.) at the local University.",
|
|
72
|
+
["He teaches science (He previously worked for 5 years as an engineer.) at the local University."]),
|
|
73
|
+
# 22) Email addresses
|
|
74
|
+
("Her email is Jane.Doe@example.com. I sent her an email.",
|
|
75
|
+
["Her email is Jane.Doe@example.com.", "I sent her an email."]),
|
|
76
|
+
# 23) Web addresses
|
|
77
|
+
("The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.",
|
|
78
|
+
["The site is: https://www.example.50.com/new-site/awesome_content.html.",
|
|
79
|
+
"Please check it out."]),
|
|
80
|
+
# 24) Single quotations inside sentence
|
|
81
|
+
(
|
|
82
|
+
"She turned to him, 'This is great.' she said.",
|
|
83
|
+
["She turned to him, 'This is great.' she said."],
|
|
84
|
+
),
|
|
85
|
+
# 25) Double quotations inside sentence
|
|
86
|
+
(
|
|
87
|
+
'She turned to him, "This is great." she said.',
|
|
88
|
+
['She turned to him, "This is great." she said.'],
|
|
89
|
+
),
|
|
90
|
+
# 26) Double quotations at the end of a sentence
|
|
91
|
+
(
|
|
92
|
+
'She turned to him, "This is great." She held the book out to show him.',
|
|
93
|
+
[
|
|
94
|
+
'She turned to him, "This is great."',
|
|
95
|
+
"She held the book out to show him."
|
|
96
|
+
],
|
|
97
|
+
),
|
|
98
|
+
# 27) Double punctuation (exclamation point)
|
|
99
|
+
("Hello!! Long time no see.", ["Hello!!", "Long time no see."]),
|
|
100
|
+
# 28) Double punctuation (question mark)
|
|
101
|
+
("Hello?? Who is there?", ["Hello??", "Who is there?"]),
|
|
102
|
+
# 29) Double punctuation (exclamation point / question mark)
|
|
103
|
+
("Hello!? Is that you?", ["Hello!?", "Is that you?"]),
|
|
104
|
+
# 30) Double punctuation (question mark / exclamation point)
|
|
105
|
+
("Hello?! Is that you?", ["Hello?!", "Is that you?"]),
|
|
106
|
+
# 31) List (period followed by parens and no period to end item)
|
|
107
|
+
(
|
|
108
|
+
"1.) The first item 2.) The second item",
|
|
109
|
+
["1.) The first item", "2.) The second item"],
|
|
110
|
+
),
|
|
111
|
+
# 32) List (period followed by parens and period to end item)
|
|
112
|
+
(
|
|
113
|
+
"1.) The first item. 2.) The second item.",
|
|
114
|
+
["1.) The first item.", "2.) The second item."],
|
|
115
|
+
),
|
|
116
|
+
# 33) List (parens and no period to end item)
|
|
117
|
+
(
|
|
118
|
+
"1) The first item 2) The second item",
|
|
119
|
+
["1) The first item", "2) The second item"],
|
|
120
|
+
),
|
|
121
|
+
# 34) List (parens and period to end item)
|
|
122
|
+
("1) The first item. 2) The second item.",
|
|
123
|
+
["1) The first item.", "2) The second item."]),
|
|
124
|
+
# 35) List (period to mark list and no period to end item)
|
|
125
|
+
(
|
|
126
|
+
"1. The first item 2. The second item",
|
|
127
|
+
["1. The first item", "2. The second item"],
|
|
128
|
+
),
|
|
129
|
+
# 36) List (period to mark list and period to end item)
|
|
130
|
+
(
|
|
131
|
+
"1. The first item. 2. The second item.",
|
|
132
|
+
["1. The first item.", "2. The second item."],
|
|
133
|
+
),
|
|
134
|
+
# 37) List with bullet
|
|
135
|
+
(
|
|
136
|
+
"• 9. The first item • 10. The second item",
|
|
137
|
+
["• 9. The first item", "• 10. The second item"],
|
|
138
|
+
),
|
|
139
|
+
# 38) List with hyphen
|
|
140
|
+
(
|
|
141
|
+
"⁃9. The first item ⁃10. The second item",
|
|
142
|
+
["⁃9. The first item", "⁃10. The second item"],
|
|
143
|
+
),
|
|
144
|
+
# 39) Alphabetical list
|
|
145
|
+
(
|
|
146
|
+
"a. The first item b. The second item c. The third list item",
|
|
147
|
+
["a. The first item", "b. The second item", "c. The third list item"],
|
|
148
|
+
),
|
|
149
|
+
# 40) Geo Coordinates
|
|
150
|
+
(
|
|
151
|
+
"You can find it at N°. 1026.253.553. That is where the treasure is.",
|
|
152
|
+
[
|
|
153
|
+
"You can find it at N°. 1026.253.553.",
|
|
154
|
+
"That is where the treasure is."
|
|
155
|
+
],
|
|
156
|
+
),
|
|
157
|
+
# 41) Named entities with an exclamation point
|
|
158
|
+
(
|
|
159
|
+
"She works at Yahoo! in the accounting department.",
|
|
160
|
+
["She works at Yahoo! in the accounting department."],
|
|
161
|
+
),
|
|
162
|
+
# 42) I as a sentence boundary and I as an abbreviation
|
|
163
|
+
(
|
|
164
|
+
"We make a good team, you and I. Did you see Albert I. Jones yesterday?",
|
|
165
|
+
[
|
|
166
|
+
"We make a good team, you and I.",
|
|
167
|
+
"Did you see Albert I. Jones yesterday?"
|
|
168
|
+
],
|
|
169
|
+
),
|
|
170
|
+
# 43) Ellipsis at end of quotation
|
|
171
|
+
(
|
|
172
|
+
"Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”",
|
|
173
|
+
[
|
|
174
|
+
"Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”"
|
|
175
|
+
],
|
|
176
|
+
),
|
|
177
|
+
# 44) Ellipsis with square brackets
|
|
178
|
+
(
|
|
179
|
+
""""Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).""",
|
|
180
|
+
[
|
|
181
|
+
'"Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).'
|
|
182
|
+
],
|
|
183
|
+
),
|
|
184
|
+
# 45) Ellipsis as sentence boundary (standard ellipsis rules)
|
|
185
|
+
("If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.",
|
|
186
|
+
[
|
|
187
|
+
"If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .",
|
|
188
|
+
"Next sentence."
|
|
189
|
+
]),
|
|
190
|
+
# 46) Ellipsis as sentence boundary (non-standard ellipsis rules)
|
|
191
|
+
(
|
|
192
|
+
"I never meant that.... She left the store.",
|
|
193
|
+
["I never meant that....", "She left the store."],
|
|
194
|
+
),
|
|
195
|
+
# 47) Ellipsis as non sentence boundary
|
|
196
|
+
(
|
|
197
|
+
"I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.",
|
|
198
|
+
[
|
|
199
|
+
"I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it."
|
|
200
|
+
],
|
|
201
|
+
),
|
|
202
|
+
# 48) 4-dot ellipsis
|
|
203
|
+
(
|
|
204
|
+
"One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .",
|
|
205
|
+
[
|
|
206
|
+
"One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.",
|
|
207
|
+
". . . The practice was not abandoned. . . ."
|
|
208
|
+
],
|
|
209
|
+
)
|
|
210
|
+
]
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import blingfire
|
|
2
|
+
import nltk
|
|
3
|
+
import pysbd
|
|
4
|
+
import spacy
|
|
5
|
+
import stanza
|
|
6
|
+
|
|
7
|
+
from syntok.tokenizer import Tokenizer
|
|
8
|
+
import syntok.segmenter as syntok_segmenter
|
|
9
|
+
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
# pySBD segmenter under test: English, with clean and char_span turned off.
pysbd_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False)

# Blank English spaCy pipeline carrying only the "sentencizer" component.
nlp = spacy.blank('en')
try:
    # spaCy 2.x: add_pipe() expects the component object.
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
except ValueError:
    # spaCy 3.x rejects component objects and expects the factory name.
    nlp.add_pipe("sentencizer")
# Pretrained 'en_core_web_sm' pipeline with the "ner" component disabled.
nlp_dep = spacy.load('en_core_web_sm', disable=["ner"])
# Model download is left disabled here.
#stanza.download('en')
# Stanza pipeline restricted to the 'tokenize' processor.
stanza_nlp = stanza.Pipeline(lang='en', processors='tokenize')

# Shared syntok tokenizer used by syntok_tokenize().
syntok_tokenizer = Tokenizer()
|
|
21
|
+
|
|
22
|
+
def blingfire_tokenize(text):
    """Return BlingFire's segmentation of *text* as a list.

    The value returned by ``blingfire.text_to_sentences`` is split on newline
    characters.
    """
    segmented = blingfire.text_to_sentences(text)
    return segmented.split('\n')
|
|
24
|
+
|
|
25
|
+
def nltk_tokenize(text):
    """Pass *text* to ``nltk.sent_tokenize`` and return its result."""
    tokenized = nltk.sent_tokenize(text)
    return tokenized
|
|
27
|
+
|
|
28
|
+
def pysbd_tokenize(text):
    """Segment *text* with the module-level pySBD segmenter and strip each segment."""
    cleaned = []
    for raw_segment in pysbd_segmenter.segment(text):
        cleaned.append(raw_segment.strip())
    return cleaned
|
|
31
|
+
|
|
32
|
+
def spacy_tokenize(text):
    """Return sentence texts from the module-level ``nlp`` pipeline.

    Newline characters at either end of each sentence are removed.
    """
    sentence_spans = nlp(text).sents
    return [span.text.strip("\n") for span in sentence_spans]
|
|
34
|
+
|
|
35
|
+
def spacy_dep_tokenize(text):
    """Return sentence texts from the module-level ``nlp_dep`` pipeline.

    Newline characters at either end of each sentence are removed.
    """
    sentence_spans = nlp_dep(text).sents
    return [span.text.strip("\n") for span in sentence_spans]
|
|
37
|
+
|
|
38
|
+
def stanza_tokenize(text):
    """Collect the ``text`` attribute of each sentence stanza returns for *text*."""
    texts = []
    for sentence in stanza_nlp(text).sentences:
        texts.append(sentence.text)
    return texts
|
|
40
|
+
|
|
41
|
+
def make_sentences(segmented_tokens):
    """Yield a stripped string for every token group in *segmented_tokens*.

    Tokens are stringified with ``str`` and concatenated without a separator.
    """
    for token_group in segmented_tokens:
        text = "".join(map(str, token_group))
        yield text.strip()
|
|
44
|
+
|
|
45
|
+
def syntok_tokenize(text):
    """Segment *text* with syntok and return a list of sentence strings.

    The module-level tokenizer produces the tokens, ``syntok_segmenter.split``
    consumes an iterator over them, and ``make_sentences`` renders each
    sentence.
    """
    tokens_iter = iter(syntok_tokenizer.split(text))
    return list(make_sentences(syntok_segmenter.split(tokens_iter)))
|
|
50
|
+
|
|
51
|
+
def load_genia_corpus(genia_raw_dir):
    """Load every ``.txt`` file under *genia_raw_dir* as a benchmark document.

    Args:
        genia_raw_dir: directory searched recursively for ``*.txt`` files.

    Returns:
        A list of ``(text, expected)`` tuples, one per file in sorted path
        order. ``text`` is the stripped file content and ``expected`` is that
        content split on newline characters.
    """
    all_docs = []
    # Sort so the document order does not depend on filesystem enumeration.
    for txtfile in sorted(Path(genia_raw_dir).glob("**/*.txt")):
        # Decode explicitly instead of relying on the platform default encoding.
        geniatext = txtfile.read_text(encoding="utf-8").strip()
        all_docs.append((geniatext, geniatext.split('\n')))

    return all_docs
|
|
62
|
+
|
|
63
|
+
def benchmark(docs, tokenize_func):
    """Count the documents whose segmentation equals the expected one.

    Args:
        docs: iterable of ``(text, expected)`` pairs.
        tokenize_func: callable applied to each text.

    Returns:
        The number of pairs for which ``tokenize_func(text) == expected``.
    """
    return sum(1 for text, expected in docs if tokenize_func(text) == expected)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--genia',
        # Without a path there is nothing to load, so fail with a usage
        # message instead of a TypeError from Path(None).
        required=True,
        help="Path to the directory containing genia data."
    )

    args = parser.parse_args()

    libraries = (
        blingfire_tokenize,
        nltk_tokenize,
        pysbd_tokenize,
        spacy_tokenize,
        spacy_dep_tokenize,
        stanza_tokenize,
        syntok_tokenize
    )

    docs = load_genia_corpus(args.genia)
    total = len(docs)
    if total == 0:
        # Avoid a ZeroDivisionError in the accuracy computation below.
        parser.error("no .txt files found under {!r}".format(args.genia))
    for tokenize_func in libraries:
        correct = benchmark(docs, tokenize_func)
        percent_score = correct/total * 100
        print()
        print(tokenize_func.__name__)
        print('GENIA abstract acc: {:0.2f}%'.format(percent_score))
|