pragmatic_tokenizer 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f19baff424f25d8ba1fcf08001dfd6fc5ac4c54b
|
4
|
+
data.tar.gz: 645aaae817dd1a77c7fbfe506e5febe9746afbf7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c4a725af2141a66f0390258b36bc66a50994cc6f44e2db6b528b28e592df50baf1865d4445012dead0bf95a1347585acd2b93b22ba968ddf3a5fc08d4bcdaf79
|
7
|
+
data.tar.gz: 0a820f57952cdc0adb8973607b083bc0e14ba5006a7cf4ab5515f874c1a6f3bc5d1ce532b895c3011203ae74555480a7b81431906607ae26a376dcc5dd4a7d66
|
@@ -11,8 +11,8 @@ module PragmaticTokenizer
|
|
11
11
|
text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
|
12
12
|
text.gsub!(/`(?!`)(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ') || text
|
13
13
|
text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
|
14
|
-
text.gsub!(/l\'/, '\1 l
|
15
|
-
text.gsub!(/L\'/, '\1 L
|
14
|
+
text.gsub!(/l\'/, '\1 l☮ \2') || text
|
15
|
+
text.gsub!(/L\'/, '\1 L☮ \2') || text
|
16
16
|
end
|
17
17
|
end
|
18
18
|
end
|
@@ -22,7 +22,11 @@ module PragmaticTokenizer
|
|
22
22
|
convert_sgl_quotes(text)
|
23
23
|
shift_beginning_hyphen(text)
|
24
24
|
shift_ending_hyphen(text)
|
25
|
-
tokens = separate_full_stop(text.squeeze(' ')
|
25
|
+
tokens = separate_full_stop(text.squeeze(' ')
|
26
|
+
.split
|
27
|
+
.flat_map { |t| (t[0] == '‚' || t[0] == ',') && t.length > 1 ? t.split(/(,|‚)/).flatten : t }
|
28
|
+
.flat_map { |t| (t[-1] == '’' || t[-1] == "'") && t.length > 1 ? t.split(/(’|')/).flatten : t }
|
29
|
+
.map { |t| convert_sym_to_punct(t) })
|
26
30
|
separate_other_ending_punc(tokens)
|
27
31
|
end
|
28
32
|
|