pragmatic_tokenizer 0.1.7 → 0.1.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/pragmatic_tokenizer/tokenizer.rb +1 -1
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0fb5eb30905feb61b0f41d91cca8a3d449b7ad42
|
4
|
+
data.tar.gz: b38edd49314a297b0d154dc5df4ce0c85e3a9519
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2340ad7a611d6b31f7d49685f75f036c9dd3e20222f85bd8d2b73a1dd7cb7a7253b62f6dde9244b3101913be8a346deefa0d77dc7c68016f50d95f935fcaab6e
|
7
|
+
data.tar.gz: 20402fb55744f2b58f8ceec560565f3fdb6063750b52234ee673205d6553d727da7f48e2e9e9debb6281b12d9f65c59912e7e54b18aef49349a2d55bea10a92c
|
data/README.md
CHANGED
@@ -61,7 +61,7 @@ Or install it yourself as:
|
|
61
61
|
##### `clean`
|
62
62
|
**default** = `'false'`
|
63
63
|
- `true`
|
64
|
-
Removes tokens consisting of only hyphens, underscores, or periods as well as some special characters (®, ©, ™).
|
64
|
+
Removes tokens consisting of only hyphens, underscores, or periods as well as some special characters (®, ©, ™). Also removes tokens longer than 50 characters and tokens containing a backslash.
|
65
65
|
- `false`
|
66
66
|
Leaves tokens as is.
|
67
67
|
|
@@ -65,7 +65,7 @@ module PragmaticTokenizer
|
|
65
65
|
|
66
66
|
def cleaner(tokens)
|
67
67
|
return tokens unless clean
|
68
|
-
tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ || PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) || t =~ /\A\.{2,}\z/ }
|
68
|
+
tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ || PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) || t =~ /\A\.{2,}\z/ || t.include?("\\") || t.length > 50 }
|
69
69
|
end
|
70
70
|
|
71
71
|
def remove_punctuation(tokens)
|