pragmatic_tokenizer 1.4.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +184 -0
- data/.rubocop_todo.yml +66 -0
- data/README.md +0 -7
- data/Rakefile +1 -1
- data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +2 -2
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +6 -6
- data/lib/pragmatic_tokenizer/languages/arabic.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/bulgarian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/catalan.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/common.rb +4 -4
- data/lib/pragmatic_tokenizer/languages/czech.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/deutsch.rb +94 -23
- data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/english.rb +91 -91
- data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/french.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/greek.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/latvian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/portuguese.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/romanian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/russian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages.rb +28 -28
- data/lib/pragmatic_tokenizer/post_processor.rb +38 -24
- data/lib/pragmatic_tokenizer/pre_processor.rb +148 -118
- data/lib/pragmatic_tokenizer/tokenizer.rb +160 -135
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +1 -0
- data/spec/languages/bulgarian_spec.rb +17 -13
- data/spec/languages/deutsch_spec.rb +110 -86
- data/spec/languages/english_spec.rb +465 -342
- data/spec/languages/french_spec.rb +3 -2
- data/spec/performance_spec.rb +7 -7
- data/spec/pragmatic_tokenizer_spec.rb +8 -8
- metadata +18 -2
@@ -1,148 +1,178 @@
|
|
1
1
|
module PragmaticTokenizer
|
2
|
-
|
3
|
-
|
4
|
-
def
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
shift_ending_hyphen(text)
|
31
|
-
text.squeeze(' ')
|
2
|
+
module PreProcessor
|
3
|
+
|
4
|
+
def pre_process(language: Languages::Common)
|
5
|
+
shift_comma!
|
6
|
+
shift_multiple_dash!
|
7
|
+
shift_inverted_question_mark!
|
8
|
+
shift_inverted_exclamation!
|
9
|
+
shift_exclamation!
|
10
|
+
shift_ellipse_three_dots!
|
11
|
+
shift_ellipse_two_dots!
|
12
|
+
shift_horizontal_ellipsis!
|
13
|
+
shift_no_space_mention!
|
14
|
+
shift_not_equals!
|
15
|
+
shift_special_quotes!
|
16
|
+
shift_colon!
|
17
|
+
shift_bracket!
|
18
|
+
shift_semicolon!
|
19
|
+
shift_percent!
|
20
|
+
shift_caret!
|
21
|
+
shift_hashtag!
|
22
|
+
shift_ampersand!
|
23
|
+
shift_vertical_bar!
|
24
|
+
convert_dbl_quotes!
|
25
|
+
convert_sgl_quotes!(language)
|
26
|
+
convert_apostrophe_s!
|
27
|
+
shift_beginning_hyphen!
|
28
|
+
shift_ending_hyphen!
|
29
|
+
squeeze(' '.freeze)
|
32
30
|
end
|
33
31
|
|
34
32
|
private
|
35
33
|
|
36
|
-
def shift_comma(text)
|
37
34
|
# Shift commas off everything but numbers
|
38
|
-
|
39
|
-
|
40
|
-
|
35
|
+
def shift_comma!
|
36
|
+
gsub!(/,(?!\d)/o, ' , '.freeze)
|
37
|
+
gsub!(/(?<=\D),(?=\S+)/, ' , '.freeze)
|
38
|
+
end
|
41
39
|
|
42
|
-
|
43
|
-
|
44
|
-
|
40
|
+
def shift_multiple_dash!
|
41
|
+
gsub!(/--+/o, ' - '.freeze)
|
42
|
+
end
|
45
43
|
|
46
|
-
|
47
|
-
|
48
|
-
|
44
|
+
def shift_inverted_question_mark!
|
45
|
+
gsub!(/¿/, ' ¿ '.freeze)
|
46
|
+
end
|
49
47
|
|
50
|
-
|
51
|
-
|
52
|
-
|
48
|
+
def shift_inverted_exclamation!
|
49
|
+
gsub!(/¡/, ' ¡ '.freeze)
|
50
|
+
end
|
53
51
|
|
54
|
-
|
55
|
-
|
56
|
-
|
52
|
+
def shift_exclamation!
|
53
|
+
gsub!(/(?<=[a-zA-z])!(?=[a-zA-z])/, ' ! '.freeze)
|
54
|
+
end
|
57
55
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
text.gsub!(/(…+)/o) { ' ' + $1 + ' ' } || text
|
62
|
-
end
|
56
|
+
def shift_horizontal_ellipsis!
|
57
|
+
gsub!(/(…+)/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
|
58
|
+
end
|
63
59
|
|
64
|
-
|
65
|
-
|
66
|
-
|
60
|
+
def shift_ellipse_two_dots!
|
61
|
+
gsub!(/(\.\.+)/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
|
62
|
+
end
|
67
63
|
|
68
|
-
|
69
|
-
|
70
|
-
|
64
|
+
def shift_ellipse_three_dots!
|
65
|
+
gsub!(/(\.\.\.+)/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
|
66
|
+
end
|
71
67
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
text.gsub!(/„/, ' „ ') || text
|
76
|
-
text.gsub!(/“/, ' “ ') || text
|
77
|
-
end
|
68
|
+
def shift_no_space_mention!
|
69
|
+
gsub!(/\.(?=(@|@)[^\.]+(\s|\z))/, '. '.freeze)
|
70
|
+
end
|
78
71
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
text.partition(':').first[-1] !~ /\A\d+/)
|
83
|
-
# Ignore web addresses
|
84
|
-
text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text
|
85
|
-
text.gsub!(/:/o, ' :') || text
|
86
|
-
text.gsub!(/(?<=\s):(?=\#)/, ': ') || text
|
87
|
-
end
|
72
|
+
def shift_not_equals!
|
73
|
+
gsub!(/≠/, ' ≠ '.freeze)
|
74
|
+
end
|
88
75
|
|
89
|
-
|
90
|
-
|
91
|
-
|
76
|
+
def shift_special_quotes!
|
77
|
+
gsub!(/([«»„“])/, ' \1 ')
|
78
|
+
end
|
92
79
|
|
93
|
-
|
94
|
-
|
95
|
-
|
80
|
+
def shift_colon!
|
81
|
+
return unless may_shift_colon?
|
82
|
+
# Ignore web addresses
|
83
|
+
replacement = replacement_for_key(':'.freeze)
|
84
|
+
gsub!(%r{(?<=[(https?|ftp)]):(?=//)}, replacement)
|
85
|
+
gsub!(/:/o, ' :'.freeze)
|
86
|
+
gsub!(/(?<=\s):(?=\#)/, ': '.freeze)
|
87
|
+
end
|
96
88
|
|
97
|
-
|
98
|
-
|
99
|
-
|
89
|
+
def may_shift_colon?
|
90
|
+
return false unless include?(':'.freeze)
|
91
|
+
partitions = partition(':'.freeze)
|
92
|
+
partitions.last[0] !~ /\A\d+/ || partitions.first[-1] !~ /\A\d+/
|
93
|
+
end
|
100
94
|
|
101
|
-
|
102
|
-
|
103
|
-
|
95
|
+
def shift_bracket!
|
96
|
+
gsub!(/([\(\[\{\}\]\)])/o) { ' ' + Regexp.last_match(1) + ' '.freeze }
|
97
|
+
end
|
104
98
|
|
105
|
-
|
106
|
-
|
107
|
-
|
99
|
+
def shift_semicolon!
|
100
|
+
gsub!(/([;])/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
|
101
|
+
end
|
108
102
|
|
109
|
-
|
110
|
-
|
111
|
-
|
103
|
+
def shift_percent!
|
104
|
+
gsub!(/(?<=\D)%(?=\d+)/, ' %'.freeze)
|
105
|
+
end
|
112
106
|
|
113
|
-
|
114
|
-
|
115
|
-
|
107
|
+
def shift_caret!
|
108
|
+
gsub!(/\^/, ' ^ '.freeze)
|
109
|
+
end
|
116
110
|
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
text.gsub!(/"(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
|
121
|
-
text.gsub!(/“(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['“'] + ' ') || text
|
122
|
-
# Convert remaining quotes to special character
|
123
|
-
text.gsub!(/"/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
|
124
|
-
text.gsub!(/''/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
|
125
|
-
text.gsub!(/”/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['”'] + ' ') || text
|
126
|
-
end
|
111
|
+
def shift_hashtag!
|
112
|
+
gsub!(/(?<=\S)(#|#)(?=\S)/, ' \1\2')
|
113
|
+
end
|
127
114
|
|
128
|
-
|
129
|
-
|
130
|
-
@language::SingleQuotes.new.handle_single_quotes(text)
|
131
|
-
else
|
132
|
-
PragmaticTokenizer::Languages::Common::SingleQuotes.new.handle_single_quotes(text)
|
115
|
+
def shift_ampersand!
|
116
|
+
gsub!(/\&/, ' & '.freeze)
|
133
117
|
end
|
134
|
-
end
|
135
118
|
|
136
|
-
|
137
|
-
|
138
|
-
|
119
|
+
def shift_vertical_bar!
|
120
|
+
gsub!(/\|/, ' | '.freeze)
|
121
|
+
end
|
139
122
|
|
140
|
-
|
141
|
-
|
142
|
-
|
123
|
+
def convert_dbl_quotes!
|
124
|
+
replace_left_double_quotes!
|
125
|
+
replace_remaining_double_quotes!
|
126
|
+
end
|
127
|
+
|
128
|
+
def replace_left_double_quotes!
|
129
|
+
replace_left_quotes!("''", '"'.freeze)
|
130
|
+
replace_left_quotes!('"', '"'.freeze)
|
131
|
+
replace_left_quotes!('“', '“'.freeze)
|
132
|
+
end
|
133
|
+
|
134
|
+
def replace_left_quotes!(style, replacement_key)
|
135
|
+
replacement = replacement_for_key(replacement_key)
|
136
|
+
gsub!(/#{style}(?=.*\w)/o, ' '.freeze + replacement + ' '.freeze)
|
137
|
+
end
|
138
|
+
|
139
|
+
def replace_remaining_double_quotes!
|
140
|
+
replace_remaining_quotes!('"', '"'.freeze)
|
141
|
+
replace_remaining_quotes!("''", '"'.freeze)
|
142
|
+
replace_remaining_quotes!('”', '”'.freeze)
|
143
|
+
end
|
144
|
+
|
145
|
+
def replace_remaining_quotes!(style, replacement_key)
|
146
|
+
replacement = replacement_for_key(replacement_key)
|
147
|
+
gsub!(/#{style}/, ' '.freeze + replacement + ' '.freeze)
|
148
|
+
end
|
149
|
+
|
150
|
+
def convert_sgl_quotes!(language)
|
151
|
+
replace(if defined?(language::SingleQuotes)
|
152
|
+
language::SingleQuotes.new
|
153
|
+
.handle_single_quotes(self)
|
154
|
+
else
|
155
|
+
PragmaticTokenizer::Languages::Common::SingleQuotes.new
|
156
|
+
.handle_single_quotes(self)
|
157
|
+
end)
|
158
|
+
end
|
159
|
+
|
160
|
+
def convert_apostrophe_s!
|
161
|
+
replacement = replacement_for_key('`'.freeze)
|
162
|
+
gsub!(/\s\u{0301}(?=s(\s|\z))/, replacement)
|
163
|
+
end
|
164
|
+
|
165
|
+
def shift_beginning_hyphen!
|
166
|
+
gsub!(/\s+-/, ' - '.freeze)
|
167
|
+
end
|
168
|
+
|
169
|
+
def shift_ending_hyphen!
|
170
|
+
gsub!(/-\s+/, ' - '.freeze)
|
171
|
+
end
|
172
|
+
|
173
|
+
def replacement_for_key(replacement_key)
|
174
|
+
PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[replacement_key]
|
175
|
+
end
|
143
176
|
|
144
|
-
def shift_ending_hyphen(text)
|
145
|
-
text.gsub!(/-\s+/, ' - ') || text
|
146
|
-
end
|
147
177
|
end
|
148
178
|
end
|