pragmatic_tokenizer 1.1.0 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9defa80663e9e5955967ba95d1bd346b5e06cca3
|
4
|
+
data.tar.gz: 5f3ec1a9392c2664b3ce8a26a7de9ba03e3a22e1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ddac620c5e335ee8ec3daa1c78ed96d9637e37d57dbb34886a153d7f292bad173ca823c291c44fc1a6f047276049d2e96562ab0a477c5cc58d6f0385636b3ba3
|
7
|
+
data.tar.gz: 9361da2cada5ab2e301601f4ddd50bde3fe7935f1b88447a3c5356b72063357a319471b4f1405f2c7aec3fa07f8c794f7a3aadc24a0e82919d2b829411725505
|
@@ -23,6 +23,7 @@ module PragmaticTokenizer
|
|
23
23
|
shift_vertical_bar(text)
|
24
24
|
convert_dbl_quotes(text)
|
25
25
|
convert_sgl_quotes(text)
|
26
|
+
convert_apostrophe_s(text)
|
26
27
|
shift_beginning_hyphen(text)
|
27
28
|
shift_ending_hyphen(text)
|
28
29
|
text.squeeze(' ')
|
@@ -120,6 +121,11 @@ module PragmaticTokenizer
|
|
120
121
|
end
|
121
122
|
end
|
122
123
|
|
124
|
+
def convert_apostrophe_s(text)
|
125
|
+
puts "Text: #{text.include?("\u{0301}")}"
|
126
|
+
text.gsub!(/\s\u{0301}(?=s(\s|\z))/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['`']) || text
|
127
|
+
end
|
128
|
+
|
123
129
|
def shift_beginning_hyphen(text)
|
124
130
|
text.gsub!(/\s+-/, ' - ') || text
|
125
131
|
end
|
@@ -122,7 +122,9 @@ module PragmaticTokenizer
|
|
122
122
|
private
|
123
123
|
|
124
124
|
def post_process(text)
|
125
|
+
puts "Text: #{text}"
|
125
126
|
@tokens = PostProcessor.new(text: text, abbreviations: abbreviations).post_process
|
127
|
+
puts "Tokens: #{@tokens}"
|
126
128
|
downcase! if downcase
|
127
129
|
expand_contractions!(contractions) if expand_contractions
|
128
130
|
clean! if clean
|