pragmatic_tokenizer 0.1.7 → 0.1.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 478c9f5259d69418221bf4513fba8e131db9c52b
4
- data.tar.gz: b01f4af0670b8a080f896f32ec1bcd2a06a128f0
3
+ metadata.gz: 0fb5eb30905feb61b0f41d91cca8a3d449b7ad42
4
+ data.tar.gz: b38edd49314a297b0d154dc5df4ce0c85e3a9519
5
5
  SHA512:
6
- metadata.gz: 17060aab4cb4228f369a93ec71a768c6729621842b515b351e645e24f1fa79ec9aebb2325c19cf81dd8b02ef44a90686883135b7182e32459a99f6fbf2904ba3
7
- data.tar.gz: f0efdaa14b3bd0f2ab185e02b9a26b74ecb5822f8b199f71168fd451441ba99b403fa11033ff3b1a804337df117b4e092506ccd294deecbcd50de2a30a6689fa
6
+ metadata.gz: 2340ad7a611d6b31f7d49685f75f036c9dd3e20222f85bd8d2b73a1dd7cb7a7253b62f6dde9244b3101913be8a346deefa0d77dc7c68016f50d95f935fcaab6e
7
+ data.tar.gz: 20402fb55744f2b58f8ceec560565f3fdb6063750b52234ee673205d6553d727da7f48e2e9e9debb6281b12d9f65c59912e7e54b18aef49349a2d55bea10a92c
data/README.md CHANGED
@@ -61,7 +61,7 @@ Or install it yourself as:
61
61
  ##### `clean`
62
62
  **default** = `'false'`
63
63
  - `true`
64
- Removes tokens consisting of only hyphens, underscores, or periods as well as some special characters (®, ©, ™).
64
+ Removes tokens consisting of only hyphens, underscores, or periods as well as some special characters (®, ©, ™). Also removes long tokens or tokens with a backslash.
65
65
  - `false`
66
66
  Leaves tokens as is.
67
67
 
@@ -65,7 +65,7 @@ module PragmaticTokenizer
65
65
 
66
66
  def cleaner(tokens)
67
67
  return tokens unless clean
68
- tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ || PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) || t =~ /\A\.{2,}\z/ }
68
+ tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ || PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) || t =~ /\A\.{2,}\z/ || t.include?("\\") || t.length > 50 }
69
69
  end
70
70
 
71
71
  def remove_punctuation(tokens)
@@ -1,3 +1,3 @@
1
1
  module PragmaticTokenizer
2
- VERSION = "0.1.7"
2
+ VERSION = "0.1.8"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias