pragmatic_tokenizer 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +184 -0
  3. data/.rubocop_todo.yml +66 -0
  4. data/README.md +0 -7
  5. data/Rakefile +1 -1
  6. data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +2 -2
  7. data/lib/pragmatic_tokenizer/full_stop_separator.rb +6 -6
  8. data/lib/pragmatic_tokenizer/languages/arabic.rb +1 -1
  9. data/lib/pragmatic_tokenizer/languages/bulgarian.rb +1 -1
  10. data/lib/pragmatic_tokenizer/languages/catalan.rb +1 -1
  11. data/lib/pragmatic_tokenizer/languages/common.rb +4 -4
  12. data/lib/pragmatic_tokenizer/languages/czech.rb +1 -1
  13. data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
  14. data/lib/pragmatic_tokenizer/languages/deutsch.rb +94 -23
  15. data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
  16. data/lib/pragmatic_tokenizer/languages/english.rb +91 -91
  17. data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
  18. data/lib/pragmatic_tokenizer/languages/french.rb +1 -1
  19. data/lib/pragmatic_tokenizer/languages/greek.rb +1 -1
  20. data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
  21. data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
  22. data/lib/pragmatic_tokenizer/languages/latvian.rb +1 -1
  23. data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
  24. data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
  25. data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
  26. data/lib/pragmatic_tokenizer/languages/portuguese.rb +2 -2
  27. data/lib/pragmatic_tokenizer/languages/romanian.rb +1 -1
  28. data/lib/pragmatic_tokenizer/languages/russian.rb +2 -2
  29. data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
  30. data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
  31. data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
  32. data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
  33. data/lib/pragmatic_tokenizer/languages.rb +28 -28
  34. data/lib/pragmatic_tokenizer/post_processor.rb +38 -24
  35. data/lib/pragmatic_tokenizer/pre_processor.rb +148 -118
  36. data/lib/pragmatic_tokenizer/tokenizer.rb +160 -135
  37. data/lib/pragmatic_tokenizer/version.rb +1 -1
  38. data/pragmatic_tokenizer.gemspec +1 -0
  39. data/spec/languages/bulgarian_spec.rb +17 -13
  40. data/spec/languages/deutsch_spec.rb +110 -86
  41. data/spec/languages/english_spec.rb +465 -342
  42. data/spec/languages/french_spec.rb +3 -2
  43. data/spec/performance_spec.rb +7 -7
  44. data/spec/pragmatic_tokenizer_spec.rb +8 -8
  45. metadata +18 -2
@@ -1,148 +1,178 @@
1
1
  module PragmaticTokenizer
2
- class PreProcessor
3
-
4
- def initialize(language: Languages::Common)
5
- @language = language
6
- end
7
-
8
- def pre_process(text:)
9
- shift_comma(text)
10
- shift_multiple_dash(text)
11
- shift_upsidedown_question_mark(text)
12
- shift_upsidedown_exclamation(text)
13
- shift_exclamation(text)
14
- shift_ellipse(text)
15
- shift_no_space_mention(text)
16
- shift_not_equals(text)
17
- shift_special_quotes(text)
18
- shift_colon(text)
19
- shift_bracket(text)
20
- shift_semicolon(text)
21
- shift_percent(text)
22
- shift_caret(text)
23
- shift_hashtag(text)
24
- shift_ampersand(text)
25
- shift_vertical_bar(text)
26
- convert_dbl_quotes(text)
27
- convert_sgl_quotes(text)
28
- convert_apostrophe_s(text)
29
- shift_beginning_hyphen(text)
30
- shift_ending_hyphen(text)
31
- text.squeeze(' ')
2
+ module PreProcessor
3
+
4
+ def pre_process(language: Languages::Common)
5
+ shift_comma!
6
+ shift_multiple_dash!
7
+ shift_inverted_question_mark!
8
+ shift_inverted_exclamation!
9
+ shift_exclamation!
10
+ shift_ellipse_three_dots!
11
+ shift_ellipse_two_dots!
12
+ shift_horizontal_ellipsis!
13
+ shift_no_space_mention!
14
+ shift_not_equals!
15
+ shift_special_quotes!
16
+ shift_colon!
17
+ shift_bracket!
18
+ shift_semicolon!
19
+ shift_percent!
20
+ shift_caret!
21
+ shift_hashtag!
22
+ shift_ampersand!
23
+ shift_vertical_bar!
24
+ convert_dbl_quotes!
25
+ convert_sgl_quotes!(language)
26
+ convert_apostrophe_s!
27
+ shift_beginning_hyphen!
28
+ shift_ending_hyphen!
29
+ squeeze(' '.freeze)
32
30
  end
33
31
 
34
32
  private
35
33
 
36
- def shift_comma(text)
37
34
  # Shift commas off everything but numbers
38
- text.gsub!(/,(?!\d)/o, ' , ') || text
39
- text.gsub!(/(?<=\D),(?=\S+)/, ' , ') || text
40
- end
35
+ def shift_comma!
36
+ gsub!(/,(?!\d)/o, ' , '.freeze)
37
+ gsub!(/(?<=\D),(?=\S+)/, ' , '.freeze)
38
+ end
41
39
 
42
- def shift_multiple_dash(text)
43
- text.gsub!(/--+/o, ' - ') || text
44
- end
40
+ def shift_multiple_dash!
41
+ gsub!(/--+/o, ' - '.freeze)
42
+ end
45
43
 
46
- def shift_upsidedown_question_mark(text)
47
- text.gsub!(/¿/, ' ¿ ') || text
48
- end
44
+ def shift_inverted_question_mark!
45
+ gsub!(/¿/, ' ¿ '.freeze)
46
+ end
49
47
 
50
- def shift_upsidedown_exclamation(text)
51
- text.gsub!(/¡/, ' ¡ ') || text
52
- end
48
+ def shift_inverted_exclamation!
49
+ gsub!(/¡/, ' ¡ '.freeze)
50
+ end
53
51
 
54
- def shift_exclamation(text)
55
- text.gsub!(/(?<=[a-zA-z])!(?=[a-zA-z])/, ' ! ') || text
56
- end
52
+ def shift_exclamation!
53
+ gsub!(/(?<=[a-zA-z])!(?=[a-zA-z])/, ' ! '.freeze)
54
+ end
57
55
 
58
- def shift_ellipse(text)
59
- text.gsub!(/(\.\.\.+)/o) { ' ' + $1 + ' ' } || text
60
- text.gsub!(/(\.\.+)/o) { ' ' + $1 + ' ' } || text
61
- text.gsub!(/(…+)/o) { ' ' + $1 + ' ' } || text
62
- end
56
+ def shift_horizontal_ellipsis!
57
+ gsub!(/(…+)/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
58
+ end
63
59
 
64
- def shift_no_space_mention(text)
65
- text.gsub!(/\.(?=(@|＠)[^\.]+(\s|\z))/, '. ') || text
66
- end
60
+ def shift_ellipse_two_dots!
61
+ gsub!(/(\.\.+)/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
62
+ end
67
63
 
68
- def shift_not_equals(text)
69
- text.gsub!(/≠/, ' ') || text
70
- end
64
+ def shift_ellipse_three_dots!
65
+ gsub!(/(\.\.\.+)/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
66
+ end
71
67
 
72
- def shift_special_quotes(text)
73
- text.gsub!(/«/, ' « ') || text
74
- text.gsub!(/»/, ' » ') || text
75
- text.gsub!(/„/, ' „ ') || text
76
- text.gsub!(/“/, ' “ ') || text
77
- end
68
+ def shift_no_space_mention!
69
+ gsub!(/\.(?=(@|＠)[^\.]+(\s|\z))/, '. '.freeze)
70
+ end
78
71
 
79
- def shift_colon(text)
80
- return text unless text.include?(':') &&
81
- (text.partition(':').last[0] !~ /\A\d+/ ||
82
- text.partition(':').first[-1] !~ /\A\d+/)
83
- # Ignore web addresses
84
- text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text
85
- text.gsub!(/:/o, ' :') || text
86
- text.gsub!(/(?<=\s):(?=\#)/, ': ') || text
87
- end
72
+ def shift_not_equals!
73
+ gsub!(/≠/, ' '.freeze)
74
+ end
88
75
 
89
- def shift_bracket(text)
90
- text.gsub!(/([\(\[\{\}\]\)])/o) { ' ' + $1 + ' ' } || text
91
- end
76
+ def shift_special_quotes!
77
+ gsub!(/([«»„“])/, ' \1 ')
78
+ end
92
79
 
93
- def shift_semicolon(text)
94
- text.gsub!(/([;])/o) { ' ' + $1 + ' ' } || text
95
- end
80
+ def shift_colon!
81
+ return unless may_shift_colon?
82
+ # Ignore web addresses
83
+ replacement = replacement_for_key(':'.freeze)
84
+ gsub!(%r{(?<=[(https?|ftp)]):(?=//)}, replacement)
85
+ gsub!(/:/o, ' :'.freeze)
86
+ gsub!(/(?<=\s):(?=\#)/, ': '.freeze)
87
+ end
96
88
 
97
- def shift_percent(text)
98
- text.gsub!(/(?<=\D)%(?=\d+)/, ' %') || text
99
- end
89
+ def may_shift_colon?
90
+ return false unless include?(':'.freeze)
91
+ partitions = partition(':'.freeze)
92
+ partitions.last[0] !~ /\A\d+/ || partitions.first[-1] !~ /\A\d+/
93
+ end
100
94
 
101
- def shift_caret(text)
102
- text.gsub!(/\^/, ' ^ ') || text
103
- end
95
+ def shift_bracket!
96
+ gsub!(/([\(\[\{\}\]\)])/o) { ' ' + Regexp.last_match(1) + ' '.freeze }
97
+ end
104
98
 
105
- def shift_hashtag(text)
106
- text.gsub!(/(?<=\S)(#|＃)(?=\S)/, ' \1\2') || text
107
- end
99
+ def shift_semicolon!
100
+ gsub!(/([;])/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
101
+ end
108
102
 
109
- def shift_ampersand(text)
110
- text.gsub!(/\&/, ' & ') || text
111
- end
103
+ def shift_percent!
104
+ gsub!(/(?<=\D)%(?=\d+)/, ' %'.freeze)
105
+ end
112
106
 
113
- def shift_vertical_bar(text)
114
- text.gsub!(/\|/, ' | ') || text
115
- end
107
+ def shift_caret!
108
+ gsub!(/\^/, ' ^ '.freeze)
109
+ end
116
110
 
117
- def convert_dbl_quotes(text)
118
- # Convert left double quotes to special character
119
- text.gsub!(/''(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
120
- text.gsub!(/"(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
121
- text.gsub!(/“(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['“'] + ' ') || text
122
- # Convert remaining quotes to special character
123
- text.gsub!(/"/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
124
- text.gsub!(/''/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
125
- text.gsub!(/”/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['”'] + ' ') || text
126
- end
111
+ def shift_hashtag!
112
+ gsub!(/(?<=\S)(#|＃)(?=\S)/, ' \1\2')
113
+ end
127
114
 
128
- def convert_sgl_quotes(text)
129
- if defined? @language::SingleQuotes
130
- @language::SingleQuotes.new.handle_single_quotes(text)
131
- else
132
- PragmaticTokenizer::Languages::Common::SingleQuotes.new.handle_single_quotes(text)
115
+ def shift_ampersand!
116
+ gsub!(/\&/, ' & '.freeze)
133
117
  end
134
- end
135
118
 
136
- def convert_apostrophe_s(text)
137
- text.gsub!(/\s\u{0301}(?=s(\s|\z))/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['`']) || text
138
- end
119
+ def shift_vertical_bar!
120
+ gsub!(/\|/, ' | '.freeze)
121
+ end
139
122
 
140
- def shift_beginning_hyphen(text)
141
- text.gsub!(/\s+-/, ' - ') || text
142
- end
123
+ def convert_dbl_quotes!
124
+ replace_left_double_quotes!
125
+ replace_remaining_double_quotes!
126
+ end
127
+
128
+ def replace_left_double_quotes!
129
+ replace_left_quotes!("''", '"'.freeze)
130
+ replace_left_quotes!('"', '"'.freeze)
131
+ replace_left_quotes!('“', '“'.freeze)
132
+ end
133
+
134
+ def replace_left_quotes!(style, replacement_key)
135
+ replacement = replacement_for_key(replacement_key)
136
+ gsub!(/#{style}(?=.*\w)/o, ' '.freeze + replacement + ' '.freeze)
137
+ end
138
+
139
+ def replace_remaining_double_quotes!
140
+ replace_remaining_quotes!('"', '"'.freeze)
141
+ replace_remaining_quotes!("''", '"'.freeze)
142
+ replace_remaining_quotes!('”', '”'.freeze)
143
+ end
144
+
145
+ def replace_remaining_quotes!(style, replacement_key)
146
+ replacement = replacement_for_key(replacement_key)
147
+ gsub!(/#{style}/, ' '.freeze + replacement + ' '.freeze)
148
+ end
149
+
150
+ def convert_sgl_quotes!(language)
151
+ replace(if defined?(language::SingleQuotes)
152
+ language::SingleQuotes.new
153
+ .handle_single_quotes(self)
154
+ else
155
+ PragmaticTokenizer::Languages::Common::SingleQuotes.new
156
+ .handle_single_quotes(self)
157
+ end)
158
+ end
159
+
160
+ def convert_apostrophe_s!
161
+ replacement = replacement_for_key('`'.freeze)
162
+ gsub!(/\s\u{0301}(?=s(\s|\z))/, replacement)
163
+ end
164
+
165
+ def shift_beginning_hyphen!
166
+ gsub!(/\s+-/, ' - '.freeze)
167
+ end
168
+
169
+ def shift_ending_hyphen!
170
+ gsub!(/-\s+/, ' - '.freeze)
171
+ end
172
+
173
+ def replacement_for_key(replacement_key)
174
+ PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[replacement_key]
175
+ end
143
176
 
144
- def shift_ending_hyphen(text)
145
- text.gsub!(/-\s+/, ' - ') || text
146
- end
147
177
  end
148
178
  end