pragmatic_tokenizer 1.4.0 → 1.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +184 -0
- data/.rubocop_todo.yml +66 -0
- data/README.md +0 -7
- data/Rakefile +1 -1
- data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +2 -2
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +6 -6
- data/lib/pragmatic_tokenizer/languages/arabic.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/bulgarian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/catalan.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/common.rb +4 -4
- data/lib/pragmatic_tokenizer/languages/czech.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/deutsch.rb +94 -23
- data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/english.rb +91 -91
- data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/french.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/greek.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/latvian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/portuguese.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/romanian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/russian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages.rb +28 -28
- data/lib/pragmatic_tokenizer/post_processor.rb +38 -24
- data/lib/pragmatic_tokenizer/pre_processor.rb +148 -118
- data/lib/pragmatic_tokenizer/tokenizer.rb +160 -135
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +1 -0
- data/spec/languages/bulgarian_spec.rb +17 -13
- data/spec/languages/deutsch_spec.rb +110 -86
- data/spec/languages/english_spec.rb +465 -342
- data/spec/languages/french_spec.rb +3 -2
- data/spec/performance_spec.rb +7 -7
- data/spec/pragmatic_tokenizer_spec.rb +8 -8
- metadata +18 -2
@@ -1,148 +1,178 @@
|
|
1
1
|
module PragmaticTokenizer
|
2
|
-
|
3
|
-
|
4
|
-
def
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
shift_ending_hyphen(text)
|
31
|
-
text.squeeze(' ')
|
2
|
+
module PreProcessor
|
3
|
+
|
4
|
+
def pre_process(language: Languages::Common)
|
5
|
+
shift_comma!
|
6
|
+
shift_multiple_dash!
|
7
|
+
shift_inverted_question_mark!
|
8
|
+
shift_inverted_exclamation!
|
9
|
+
shift_exclamation!
|
10
|
+
shift_ellipse_three_dots!
|
11
|
+
shift_ellipse_two_dots!
|
12
|
+
shift_horizontal_ellipsis!
|
13
|
+
shift_no_space_mention!
|
14
|
+
shift_not_equals!
|
15
|
+
shift_special_quotes!
|
16
|
+
shift_colon!
|
17
|
+
shift_bracket!
|
18
|
+
shift_semicolon!
|
19
|
+
shift_percent!
|
20
|
+
shift_caret!
|
21
|
+
shift_hashtag!
|
22
|
+
shift_ampersand!
|
23
|
+
shift_vertical_bar!
|
24
|
+
convert_dbl_quotes!
|
25
|
+
convert_sgl_quotes!(language)
|
26
|
+
convert_apostrophe_s!
|
27
|
+
shift_beginning_hyphen!
|
28
|
+
shift_ending_hyphen!
|
29
|
+
squeeze(' '.freeze)
|
32
30
|
end
|
33
31
|
|
34
32
|
private
|
35
33
|
|
36
|
-
def shift_comma(text)
|
37
34
|
# Shift commas off everything but numbers
|
38
|
-
|
39
|
-
|
40
|
-
|
35
|
+
def shift_comma!
|
36
|
+
gsub!(/,(?!\d)/o, ' , '.freeze)
|
37
|
+
gsub!(/(?<=\D),(?=\S+)/, ' , '.freeze)
|
38
|
+
end
|
41
39
|
|
42
|
-
|
43
|
-
|
44
|
-
|
40
|
+
def shift_multiple_dash!
|
41
|
+
gsub!(/--+/o, ' - '.freeze)
|
42
|
+
end
|
45
43
|
|
46
|
-
|
47
|
-
|
48
|
-
|
44
|
+
def shift_inverted_question_mark!
|
45
|
+
gsub!(/¿/, ' ¿ '.freeze)
|
46
|
+
end
|
49
47
|
|
50
|
-
|
51
|
-
|
52
|
-
|
48
|
+
def shift_inverted_exclamation!
|
49
|
+
gsub!(/¡/, ' ¡ '.freeze)
|
50
|
+
end
|
53
51
|
|
54
|
-
|
55
|
-
|
56
|
-
|
52
|
+
def shift_exclamation!
|
53
|
+
gsub!(/(?<=[a-zA-z])!(?=[a-zA-z])/, ' ! '.freeze)
|
54
|
+
end
|
57
55
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
text.gsub!(/(…+)/o) { ' ' + $1 + ' ' } || text
|
62
|
-
end
|
56
|
+
def shift_horizontal_ellipsis!
|
57
|
+
gsub!(/(…+)/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
|
58
|
+
end
|
63
59
|
|
64
|
-
|
65
|
-
|
66
|
-
|
60
|
+
def shift_ellipse_two_dots!
|
61
|
+
gsub!(/(\.\.+)/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
|
62
|
+
end
|
67
63
|
|
68
|
-
|
69
|
-
|
70
|
-
|
64
|
+
def shift_ellipse_three_dots!
|
65
|
+
gsub!(/(\.\.\.+)/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
|
66
|
+
end
|
71
67
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
text.gsub!(/„/, ' „ ') || text
|
76
|
-
text.gsub!(/“/, ' “ ') || text
|
77
|
-
end
|
68
|
+
def shift_no_space_mention!
|
69
|
+
gsub!(/\.(?=(@|@)[^\.]+(\s|\z))/, '. '.freeze)
|
70
|
+
end
|
78
71
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
text.partition(':').first[-1] !~ /\A\d+/)
|
83
|
-
# Ignore web addresses
|
84
|
-
text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text
|
85
|
-
text.gsub!(/:/o, ' :') || text
|
86
|
-
text.gsub!(/(?<=\s):(?=\#)/, ': ') || text
|
87
|
-
end
|
72
|
+
def shift_not_equals!
|
73
|
+
gsub!(/≠/, ' ≠ '.freeze)
|
74
|
+
end
|
88
75
|
|
89
|
-
|
90
|
-
|
91
|
-
|
76
|
+
def shift_special_quotes!
|
77
|
+
gsub!(/([«»„“])/, ' \1 ')
|
78
|
+
end
|
92
79
|
|
93
|
-
|
94
|
-
|
95
|
-
|
80
|
+
def shift_colon!
|
81
|
+
return unless may_shift_colon?
|
82
|
+
# Ignore web addresses
|
83
|
+
replacement = replacement_for_key(':'.freeze)
|
84
|
+
gsub!(%r{(?<=[(https?|ftp)]):(?=//)}, replacement)
|
85
|
+
gsub!(/:/o, ' :'.freeze)
|
86
|
+
gsub!(/(?<=\s):(?=\#)/, ': '.freeze)
|
87
|
+
end
|
96
88
|
|
97
|
-
|
98
|
-
|
99
|
-
|
89
|
+
def may_shift_colon?
|
90
|
+
return false unless include?(':'.freeze)
|
91
|
+
partitions = partition(':'.freeze)
|
92
|
+
partitions.last[0] !~ /\A\d+/ || partitions.first[-1] !~ /\A\d+/
|
93
|
+
end
|
100
94
|
|
101
|
-
|
102
|
-
|
103
|
-
|
95
|
+
def shift_bracket!
|
96
|
+
gsub!(/([\(\[\{\}\]\)])/o) { ' ' + Regexp.last_match(1) + ' '.freeze }
|
97
|
+
end
|
104
98
|
|
105
|
-
|
106
|
-
|
107
|
-
|
99
|
+
def shift_semicolon!
|
100
|
+
gsub!(/([;])/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
|
101
|
+
end
|
108
102
|
|
109
|
-
|
110
|
-
|
111
|
-
|
103
|
+
def shift_percent!
|
104
|
+
gsub!(/(?<=\D)%(?=\d+)/, ' %'.freeze)
|
105
|
+
end
|
112
106
|
|
113
|
-
|
114
|
-
|
115
|
-
|
107
|
+
def shift_caret!
|
108
|
+
gsub!(/\^/, ' ^ '.freeze)
|
109
|
+
end
|
116
110
|
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
text.gsub!(/"(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
|
121
|
-
text.gsub!(/“(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['“'] + ' ') || text
|
122
|
-
# Convert remaining quotes to special character
|
123
|
-
text.gsub!(/"/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
|
124
|
-
text.gsub!(/''/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
|
125
|
-
text.gsub!(/”/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['”'] + ' ') || text
|
126
|
-
end
|
111
|
+
def shift_hashtag!
|
112
|
+
gsub!(/(?<=\S)(#|#)(?=\S)/, ' \1\2')
|
113
|
+
end
|
127
114
|
|
128
|
-
|
129
|
-
|
130
|
-
@language::SingleQuotes.new.handle_single_quotes(text)
|
131
|
-
else
|
132
|
-
PragmaticTokenizer::Languages::Common::SingleQuotes.new.handle_single_quotes(text)
|
115
|
+
def shift_ampersand!
|
116
|
+
gsub!(/\&/, ' & '.freeze)
|
133
117
|
end
|
134
|
-
end
|
135
118
|
|
136
|
-
|
137
|
-
|
138
|
-
|
119
|
+
def shift_vertical_bar!
|
120
|
+
gsub!(/\|/, ' | '.freeze)
|
121
|
+
end
|
139
122
|
|
140
|
-
|
141
|
-
|
142
|
-
|
123
|
+
def convert_dbl_quotes!
|
124
|
+
replace_left_double_quotes!
|
125
|
+
replace_remaining_double_quotes!
|
126
|
+
end
|
127
|
+
|
128
|
+
def replace_left_double_quotes!
|
129
|
+
replace_left_quotes!("''", '"'.freeze)
|
130
|
+
replace_left_quotes!('"', '"'.freeze)
|
131
|
+
replace_left_quotes!('“', '“'.freeze)
|
132
|
+
end
|
133
|
+
|
134
|
+
def replace_left_quotes!(style, replacement_key)
|
135
|
+
replacement = replacement_for_key(replacement_key)
|
136
|
+
gsub!(/#{style}(?=.*\w)/o, ' '.freeze + replacement + ' '.freeze)
|
137
|
+
end
|
138
|
+
|
139
|
+
def replace_remaining_double_quotes!
|
140
|
+
replace_remaining_quotes!('"', '"'.freeze)
|
141
|
+
replace_remaining_quotes!("''", '"'.freeze)
|
142
|
+
replace_remaining_quotes!('”', '”'.freeze)
|
143
|
+
end
|
144
|
+
|
145
|
+
def replace_remaining_quotes!(style, replacement_key)
|
146
|
+
replacement = replacement_for_key(replacement_key)
|
147
|
+
gsub!(/#{style}/, ' '.freeze + replacement + ' '.freeze)
|
148
|
+
end
|
149
|
+
|
150
|
+
def convert_sgl_quotes!(language)
|
151
|
+
replace(if defined?(language::SingleQuotes)
|
152
|
+
language::SingleQuotes.new
|
153
|
+
.handle_single_quotes(self)
|
154
|
+
else
|
155
|
+
PragmaticTokenizer::Languages::Common::SingleQuotes.new
|
156
|
+
.handle_single_quotes(self)
|
157
|
+
end)
|
158
|
+
end
|
159
|
+
|
160
|
+
def convert_apostrophe_s!
|
161
|
+
replacement = replacement_for_key('`'.freeze)
|
162
|
+
gsub!(/\s\u{0301}(?=s(\s|\z))/, replacement)
|
163
|
+
end
|
164
|
+
|
165
|
+
def shift_beginning_hyphen!
|
166
|
+
gsub!(/\s+-/, ' - '.freeze)
|
167
|
+
end
|
168
|
+
|
169
|
+
def shift_ending_hyphen!
|
170
|
+
gsub!(/-\s+/, ' - '.freeze)
|
171
|
+
end
|
172
|
+
|
173
|
+
def replacement_for_key(replacement_key)
|
174
|
+
PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[replacement_key]
|
175
|
+
end
|
143
176
|
|
144
|
-
def shift_ending_hyphen(text)
|
145
|
-
text.gsub!(/-\s+/, ' - ') || text
|
146
|
-
end
|
147
177
|
end
|
148
178
|
end
|