pragmatic_tokenizer 0.1.9 → 0.1.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0cad1673af08205c4d74c54e33a5c81af1868db1
4
- data.tar.gz: 1c69082577b4a35d6cccb5120eb1f05520d6724f
3
+ metadata.gz: 4b2dd34906b6a89945200b65353a772089c2fdb2
4
+ data.tar.gz: dceed8cdbb9cefcd822ddf68cc99073b76aa9506
5
5
  SHA512:
6
- metadata.gz: 15ba9389b8c229575c00834be9fb9eb43ebb5b674f2ad03ef19f2da1818e7152aaaece1af0e86cbc7a215bce7dd19075487c3bd865c56b205a6f06cd6c5659d5
7
- data.tar.gz: fd97185a77a8cc88809a4a55387d28bb78b48833a1e8974ac59b583aee5ea18c9ae81e237180e0a4885443dd03efc2fd57802d1c2c199bc950dc567f160cb63e
6
+ metadata.gz: 21d892cc075635b1d1ba42c122421bb237bbc01b35ab2068648fb8c4eb9725a2174f22b89a41976d9df06546bdf92e1d4f4f8520a409edf1d6c2c62df1faf8cc
7
+ data.tar.gz: 2750c610896eeb49f8218e666be12f6deeb5675fd6bef7678f826d07031fa0fba94fd5113e0473f263899f20d5e740c4117748d48621e2a51fe1c3f835275a1a
@@ -14,6 +14,7 @@ module PragmaticTokenizer
14
14
  shift_special_quotes(text)
15
15
  shift_colon(text)
16
16
  shift_bracket(text)
17
+ shift_semicolon(text)
17
18
  convert_dbl_quotes(text)
18
19
  convert_sgl_quotes(text)
19
20
  tokens = separate_full_stop(text.squeeze(' ').split.map { |t| convert_sym_to_punct(t.downcase) })
@@ -74,6 +75,10 @@ module PragmaticTokenizer
74
75
  text.gsub!(/:/o, ' :') || text
75
76
  end
76
77
 
78
+ def shift_semicolon(text)
79
+ text.gsub!(/([;])/o) { ' ' + $1 + ' ' } || text
80
+ end
81
+
77
82
  def shift_ellipse(text)
78
83
  text.gsub!(/(\.\.\.+)/o) { ' ' + $1 + ' ' } || text
79
84
  end
@@ -65,7 +65,13 @@ module PragmaticTokenizer
65
65
 
66
66
  def cleaner(tokens)
67
67
  return tokens unless clean
68
- tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ || PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) || t =~ /\A\.{2,}\z/ || t.include?("\\") || t.length > 50 }
68
+ tokens.delete_if { |t| t =~ /\A_+\z/ ||
69
+ t =~ /\A-+\z/ ||
70
+ PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
71
+ t =~ /\A\.{2,}\z/ || t.include?("\\") ||
72
+ t.length > 50 ||
73
+ (t.length > 1 && t =~ /[#&*+<=>@^|~]/i)
74
+ }
69
75
  end
70
76
 
71
77
  def remove_punctuation(tokens)
@@ -1,3 +1,3 @@
1
1
  module PragmaticTokenizer
2
- VERSION = "0.1.9"
2
+ VERSION = "0.1.10"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.9
4
+ version: 0.1.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias