pragmatic_tokenizer 0.1.9 → 0.1.10

This diff shows the content changes between publicly released versions of this package as published to one of the supported registries. The information is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0cad1673af08205c4d74c54e33a5c81af1868db1
4
- data.tar.gz: 1c69082577b4a35d6cccb5120eb1f05520d6724f
3
+ metadata.gz: 4b2dd34906b6a89945200b65353a772089c2fdb2
4
+ data.tar.gz: dceed8cdbb9cefcd822ddf68cc99073b76aa9506
5
5
  SHA512:
6
- metadata.gz: 15ba9389b8c229575c00834be9fb9eb43ebb5b674f2ad03ef19f2da1818e7152aaaece1af0e86cbc7a215bce7dd19075487c3bd865c56b205a6f06cd6c5659d5
7
- data.tar.gz: fd97185a77a8cc88809a4a55387d28bb78b48833a1e8974ac59b583aee5ea18c9ae81e237180e0a4885443dd03efc2fd57802d1c2c199bc950dc567f160cb63e
6
+ metadata.gz: 21d892cc075635b1d1ba42c122421bb237bbc01b35ab2068648fb8c4eb9725a2174f22b89a41976d9df06546bdf92e1d4f4f8520a409edf1d6c2c62df1faf8cc
7
+ data.tar.gz: 2750c610896eeb49f8218e666be12f6deeb5675fd6bef7678f826d07031fa0fba94fd5113e0473f263899f20d5e740c4117748d48621e2a51fe1c3f835275a1a
@@ -14,6 +14,7 @@ module PragmaticTokenizer
14
14
  shift_special_quotes(text)
15
15
  shift_colon(text)
16
16
  shift_bracket(text)
17
+ shift_semicolon(text)
17
18
  convert_dbl_quotes(text)
18
19
  convert_sgl_quotes(text)
19
20
  tokens = separate_full_stop(text.squeeze(' ').split.map { |t| convert_sym_to_punct(t.downcase) })
@@ -74,6 +75,10 @@ module PragmaticTokenizer
74
75
  text.gsub!(/:/o, ' :') || text
75
76
  end
76
77
 
78
+ def shift_semicolon(text)
79
+ text.gsub!(/([;])/o) { ' ' + $1 + ' ' } || text
80
+ end
81
+
77
82
  def shift_ellipse(text)
78
83
  text.gsub!(/(\.\.\.+)/o) { ' ' + $1 + ' ' } || text
79
84
  end
@@ -65,7 +65,13 @@ module PragmaticTokenizer
65
65
 
66
66
  def cleaner(tokens)
67
67
  return tokens unless clean
68
- tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ || PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) || t =~ /\A\.{2,}\z/ || t.include?("\\") || t.length > 50 }
68
+ tokens.delete_if { |t| t =~ /\A_+\z/ ||
69
+ t =~ /\A-+\z/ ||
70
+ PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
71
+ t =~ /\A\.{2,}\z/ || t.include?("\\") ||
72
+ t.length > 50 ||
73
+ (t.length > 1 && t =~ /[#&*+<=>@^|~]/i)
74
+ }
69
75
  end
70
76
 
71
77
  def remove_punctuation(tokens)
@@ -1,3 +1,3 @@
1
1
  module PragmaticTokenizer
2
- VERSION = "0.1.9"
2
+ VERSION = "0.1.10"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.9
4
+ version: 0.1.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias