pragmatic_tokenizer 1.4.0 → 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +184 -0
  3. data/.rubocop_todo.yml +66 -0
  4. data/README.md +0 -7
  5. data/Rakefile +1 -1
  6. data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +2 -2
  7. data/lib/pragmatic_tokenizer/full_stop_separator.rb +6 -6
  8. data/lib/pragmatic_tokenizer/languages/arabic.rb +1 -1
  9. data/lib/pragmatic_tokenizer/languages/bulgarian.rb +1 -1
  10. data/lib/pragmatic_tokenizer/languages/catalan.rb +1 -1
  11. data/lib/pragmatic_tokenizer/languages/common.rb +4 -4
  12. data/lib/pragmatic_tokenizer/languages/czech.rb +1 -1
  13. data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
  14. data/lib/pragmatic_tokenizer/languages/deutsch.rb +94 -23
  15. data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
  16. data/lib/pragmatic_tokenizer/languages/english.rb +91 -91
  17. data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
  18. data/lib/pragmatic_tokenizer/languages/french.rb +1 -1
  19. data/lib/pragmatic_tokenizer/languages/greek.rb +1 -1
  20. data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
  21. data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
  22. data/lib/pragmatic_tokenizer/languages/latvian.rb +1 -1
  23. data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
  24. data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
  25. data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
  26. data/lib/pragmatic_tokenizer/languages/portuguese.rb +2 -2
  27. data/lib/pragmatic_tokenizer/languages/romanian.rb +1 -1
  28. data/lib/pragmatic_tokenizer/languages/russian.rb +2 -2
  29. data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
  30. data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
  31. data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
  32. data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
  33. data/lib/pragmatic_tokenizer/languages.rb +28 -28
  34. data/lib/pragmatic_tokenizer/post_processor.rb +38 -24
  35. data/lib/pragmatic_tokenizer/pre_processor.rb +148 -118
  36. data/lib/pragmatic_tokenizer/tokenizer.rb +160 -135
  37. data/lib/pragmatic_tokenizer/version.rb +1 -1
  38. data/pragmatic_tokenizer.gemspec +1 -0
  39. data/spec/languages/bulgarian_spec.rb +17 -13
  40. data/spec/languages/deutsch_spec.rb +110 -86
  41. data/spec/languages/english_spec.rb +465 -342
  42. data/spec/languages/french_spec.rb +3 -2
  43. data/spec/performance_spec.rb +7 -7
  44. data/spec/pragmatic_tokenizer_spec.rb +8 -8
  45. metadata +18 -2
@@ -1,148 +1,178 @@
1
1
  module PragmaticTokenizer
2
- class PreProcessor
3
-
4
- def initialize(language: Languages::Common)
5
- @language = language
6
- end
7
-
8
- def pre_process(text:)
9
- shift_comma(text)
10
- shift_multiple_dash(text)
11
- shift_upsidedown_question_mark(text)
12
- shift_upsidedown_exclamation(text)
13
- shift_exclamation(text)
14
- shift_ellipse(text)
15
- shift_no_space_mention(text)
16
- shift_not_equals(text)
17
- shift_special_quotes(text)
18
- shift_colon(text)
19
- shift_bracket(text)
20
- shift_semicolon(text)
21
- shift_percent(text)
22
- shift_caret(text)
23
- shift_hashtag(text)
24
- shift_ampersand(text)
25
- shift_vertical_bar(text)
26
- convert_dbl_quotes(text)
27
- convert_sgl_quotes(text)
28
- convert_apostrophe_s(text)
29
- shift_beginning_hyphen(text)
30
- shift_ending_hyphen(text)
31
- text.squeeze(' ')
2
+ module PreProcessor
3
+
4
+ def pre_process(language: Languages::Common)
5
+ shift_comma!
6
+ shift_multiple_dash!
7
+ shift_inverted_question_mark!
8
+ shift_inverted_exclamation!
9
+ shift_exclamation!
10
+ shift_ellipse_three_dots!
11
+ shift_ellipse_two_dots!
12
+ shift_horizontal_ellipsis!
13
+ shift_no_space_mention!
14
+ shift_not_equals!
15
+ shift_special_quotes!
16
+ shift_colon!
17
+ shift_bracket!
18
+ shift_semicolon!
19
+ shift_percent!
20
+ shift_caret!
21
+ shift_hashtag!
22
+ shift_ampersand!
23
+ shift_vertical_bar!
24
+ convert_dbl_quotes!
25
+ convert_sgl_quotes!(language)
26
+ convert_apostrophe_s!
27
+ shift_beginning_hyphen!
28
+ shift_ending_hyphen!
29
+ squeeze(' '.freeze)
32
30
  end
33
31
 
34
32
  private
35
33
 
36
- def shift_comma(text)
37
34
  # Shift commas off everything but numbers
38
- text.gsub!(/,(?!\d)/o, ' , ') || text
39
- text.gsub!(/(?<=\D),(?=\S+)/, ' , ') || text
40
- end
35
+ def shift_comma!
36
+ gsub!(/,(?!\d)/o, ' , '.freeze)
37
+ gsub!(/(?<=\D),(?=\S+)/, ' , '.freeze)
38
+ end
41
39
 
42
- def shift_multiple_dash(text)
43
- text.gsub!(/--+/o, ' - ') || text
44
- end
40
+ def shift_multiple_dash!
41
+ gsub!(/--+/o, ' - '.freeze)
42
+ end
45
43
 
46
- def shift_upsidedown_question_mark(text)
47
- text.gsub!(/¿/, ' ¿ ') || text
48
- end
44
+ def shift_inverted_question_mark!
45
+ gsub!(/¿/, ' ¿ '.freeze)
46
+ end
49
47
 
50
- def shift_upsidedown_exclamation(text)
51
- text.gsub!(/¡/, ' ¡ ') || text
52
- end
48
+ def shift_inverted_exclamation!
49
+ gsub!(/¡/, ' ¡ '.freeze)
50
+ end
53
51
 
54
- def shift_exclamation(text)
55
- text.gsub!(/(?<=[a-zA-z])!(?=[a-zA-z])/, ' ! ') || text
56
- end
52
+ def shift_exclamation!
53
+ gsub!(/(?<=[a-zA-z])!(?=[a-zA-z])/, ' ! '.freeze)
54
+ end
57
55
 
58
- def shift_ellipse(text)
59
- text.gsub!(/(\.\.\.+)/o) { ' ' + $1 + ' ' } || text
60
- text.gsub!(/(\.\.+)/o) { ' ' + $1 + ' ' } || text
61
- text.gsub!(/(…+)/o) { ' ' + $1 + ' ' } || text
62
- end
56
+ def shift_horizontal_ellipsis!
57
+ gsub!(/(…+)/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
58
+ end
63
59
 
64
- def shift_no_space_mention(text)
65
- text.gsub!(/\.(?=(@|＠)[^\.]+(\s|\z))/, '. ') || text
66
- end
60
+ def shift_ellipse_two_dots!
61
+ gsub!(/(\.\.+)/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
62
+ end
67
63
 
68
- def shift_not_equals(text)
69
- text.gsub!(/≠/, ' ') || text
70
- end
64
+ def shift_ellipse_three_dots!
65
+ gsub!(/(\.\.\.+)/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
66
+ end
71
67
 
72
- def shift_special_quotes(text)
73
- text.gsub!(/«/, ' « ') || text
74
- text.gsub!(/»/, ' » ') || text
75
- text.gsub!(/„/, ' „ ') || text
76
- text.gsub!(/“/, ' “ ') || text
77
- end
68
+ def shift_no_space_mention!
69
+ gsub!(/\.(?=(@|＠)[^\.]+(\s|\z))/, '. '.freeze)
70
+ end
78
71
 
79
- def shift_colon(text)
80
- return text unless text.include?(':') &&
81
- (text.partition(':').last[0] !~ /\A\d+/ ||
82
- text.partition(':').first[-1] !~ /\A\d+/)
83
- # Ignore web addresses
84
- text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text
85
- text.gsub!(/:/o, ' :') || text
86
- text.gsub!(/(?<=\s):(?=\#)/, ': ') || text
87
- end
72
+ def shift_not_equals!
73
+ gsub!(/≠/, ' '.freeze)
74
+ end
88
75
 
89
- def shift_bracket(text)
90
- text.gsub!(/([\(\[\{\}\]\)])/o) { ' ' + $1 + ' ' } || text
91
- end
76
+ def shift_special_quotes!
77
+ gsub!(/([«»„“])/, ' \1 ')
78
+ end
92
79
 
93
- def shift_semicolon(text)
94
- text.gsub!(/([;])/o) { ' ' + $1 + ' ' } || text
95
- end
80
+ def shift_colon!
81
+ return unless may_shift_colon?
82
+ # Ignore web addresses
83
+ replacement = replacement_for_key(':'.freeze)
84
+ gsub!(%r{(?<=[(https?|ftp)]):(?=//)}, replacement)
85
+ gsub!(/:/o, ' :'.freeze)
86
+ gsub!(/(?<=\s):(?=\#)/, ': '.freeze)
87
+ end
96
88
 
97
- def shift_percent(text)
98
- text.gsub!(/(?<=\D)%(?=\d+)/, ' %') || text
99
- end
89
+ def may_shift_colon?
90
+ return false unless include?(':'.freeze)
91
+ partitions = partition(':'.freeze)
92
+ partitions.last[0] !~ /\A\d+/ || partitions.first[-1] !~ /\A\d+/
93
+ end
100
94
 
101
- def shift_caret(text)
102
- text.gsub!(/\^/, ' ^ ') || text
103
- end
95
+ def shift_bracket!
96
+ gsub!(/([\(\[\{\}\]\)])/o) { ' ' + Regexp.last_match(1) + ' '.freeze }
97
+ end
104
98
 
105
- def shift_hashtag(text)
106
- text.gsub!(/(?<=\S)(#|＃)(?=\S)/, ' \1\2') || text
107
- end
99
+ def shift_semicolon!
100
+ gsub!(/([;])/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
101
+ end
108
102
 
109
- def shift_ampersand(text)
110
- text.gsub!(/\&/, ' & ') || text
111
- end
103
+ def shift_percent!
104
+ gsub!(/(?<=\D)%(?=\d+)/, ' %'.freeze)
105
+ end
112
106
 
113
- def shift_vertical_bar(text)
114
- text.gsub!(/\|/, ' | ') || text
115
- end
107
+ def shift_caret!
108
+ gsub!(/\^/, ' ^ '.freeze)
109
+ end
116
110
 
117
- def convert_dbl_quotes(text)
118
- # Convert left double quotes to special character
119
- text.gsub!(/''(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
120
- text.gsub!(/"(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
121
- text.gsub!(/“(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['“'] + ' ') || text
122
- # Convert remaining quotes to special character
123
- text.gsub!(/"/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
124
- text.gsub!(/''/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
125
- text.gsub!(/”/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['”'] + ' ') || text
126
- end
111
+ def shift_hashtag!
112
+ gsub!(/(?<=\S)(#|＃)(?=\S)/, ' \1\2')
113
+ end
127
114
 
128
- def convert_sgl_quotes(text)
129
- if defined? @language::SingleQuotes
130
- @language::SingleQuotes.new.handle_single_quotes(text)
131
- else
132
- PragmaticTokenizer::Languages::Common::SingleQuotes.new.handle_single_quotes(text)
115
+ def shift_ampersand!
116
+ gsub!(/\&/, ' & '.freeze)
133
117
  end
134
- end
135
118
 
136
- def convert_apostrophe_s(text)
137
- text.gsub!(/\s\u{0301}(?=s(\s|\z))/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['`']) || text
138
- end
119
+ def shift_vertical_bar!
120
+ gsub!(/\|/, ' | '.freeze)
121
+ end
139
122
 
140
- def shift_beginning_hyphen(text)
141
- text.gsub!(/\s+-/, ' - ') || text
142
- end
123
+ def convert_dbl_quotes!
124
+ replace_left_double_quotes!
125
+ replace_remaining_double_quotes!
126
+ end
127
+
128
+ def replace_left_double_quotes!
129
+ replace_left_quotes!("''", '"'.freeze)
130
+ replace_left_quotes!('"', '"'.freeze)
131
+ replace_left_quotes!('“', '“'.freeze)
132
+ end
133
+
134
+ def replace_left_quotes!(style, replacement_key)
135
+ replacement = replacement_for_key(replacement_key)
136
+ gsub!(/#{style}(?=.*\w)/o, ' '.freeze + replacement + ' '.freeze)
137
+ end
138
+
139
+ def replace_remaining_double_quotes!
140
+ replace_remaining_quotes!('"', '"'.freeze)
141
+ replace_remaining_quotes!("''", '"'.freeze)
142
+ replace_remaining_quotes!('”', '”'.freeze)
143
+ end
144
+
145
+ def replace_remaining_quotes!(style, replacement_key)
146
+ replacement = replacement_for_key(replacement_key)
147
+ gsub!(/#{style}/, ' '.freeze + replacement + ' '.freeze)
148
+ end
149
+
150
+ def convert_sgl_quotes!(language)
151
+ replace(if defined?(language::SingleQuotes)
152
+ language::SingleQuotes.new
153
+ .handle_single_quotes(self)
154
+ else
155
+ PragmaticTokenizer::Languages::Common::SingleQuotes.new
156
+ .handle_single_quotes(self)
157
+ end)
158
+ end
159
+
160
+ def convert_apostrophe_s!
161
+ replacement = replacement_for_key('`'.freeze)
162
+ gsub!(/\s\u{0301}(?=s(\s|\z))/, replacement)
163
+ end
164
+
165
+ def shift_beginning_hyphen!
166
+ gsub!(/\s+-/, ' - '.freeze)
167
+ end
168
+
169
+ def shift_ending_hyphen!
170
+ gsub!(/-\s+/, ' - '.freeze)
171
+ end
172
+
173
+ def replacement_for_key(replacement_key)
174
+ PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[replacement_key]
175
+ end
143
176
 
144
- def shift_ending_hyphen(text)
145
- text.gsub!(/-\s+/, ' - ') || text
146
- end
147
177
  end
148
178
  end