reclassifier 0.1.4 → 0.2.5

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
@@ -1,3 +1,3 @@
1
1
  module Reclassifier
2
- VERSION = "0.1.4"
2
+ VERSION = "0.2.5"
3
3
  end
@@ -1,84 +1,8 @@
1
1
  module Reclassifier::WordHash
2
- CORPUS_SKIP_WORDS = ["a",
3
- "again",
4
- "all",
5
- "along",
6
- "are",
7
- "also",
8
- "an",
9
- "and",
10
- "as",
11
- "at",
12
- "but",
13
- "by",
14
- "came",
15
- "can",
16
- "cant",
17
- "couldnt",
18
- "did",
19
- "didn",
20
- "didnt",
21
- "do",
22
- "doesnt",
23
- "dont",
24
- "ever",
25
- "first",
26
- "from",
27
- "have",
28
- "her",
29
- "here",
30
- "him",
31
- "how",
32
- "i",
33
- "if",
34
- "in",
35
- "into",
36
- "is",
37
- "isnt",
38
- "it",
39
- "itll",
40
- "just",
41
- "last",
42
- "least",
43
- "like",
44
- "most",
45
- "my",
46
- "new",
47
- "no",
48
- "not",
49
- "now",
50
- "of",
51
- "on",
52
- "or",
53
- "should",
54
- "sinc",
55
- "so",
56
- "some",
57
- "th",
58
- "than",
59
- "this",
60
- "that",
61
- "the",
62
- "their",
63
- "then",
64
- "those",
65
- "to",
66
- "told",
67
- "too",
68
- "true",
69
- "try",
70
- "until",
71
- "url",
72
- "us",
73
- "were",
74
- "when",
75
- "whether",
76
- "while",
77
- "with",
78
- "within",
79
- "yes",
80
- "you",
81
- "youll"]
2
+ CORPUS_SKIP_WORDS = %w(a again all along are also an and as at but by came can cant couldnt did didn didnt do
3
+ doesnt dont ever first from have her here him how i if in into is isnt it itll just last
4
+ least like most my new no not now of on or should sinc so some th than this that the
5
+ their then those to told too true try until url us were when whether while with within yes you youll)
82
6
 
83
7
  # Return a Hash of strings => ints. Each word in the string is stemmed,
84
8
  # symbolized, and indexed to its frequency in the document.
@@ -88,7 +12,7 @@ module Reclassifier::WordHash
88
12
 
89
13
  # Return a word hash without extra punctuation or short symbols, just stemmed words
90
14
  def clean_word_hash(string)
91
- word_hash_for_words(string.gsub(/[^\w\s]/,"").split)
15
+ word_hash_for_words(string.gsub(/[^\w\s]/," ").split)
92
16
  end
93
17
 
94
18
  def word_hash_for_words(words)
@@ -15,5 +15,15 @@ describe Reclassifier::Bayes do
15
15
 
16
16
  subject.clean_word_hash("here are some good words of test's. I hope you love them!").should eq(hash)
17
17
  end
18
+
19
+ it 'should convert non-word characters to spaces' do
20
+ subject.clean_word_hash('Payment-Transfer').should eq(:payment => 1, :transfer => 1)
21
+ end
22
+ end
23
+
24
+ [:word_hash, :clean_word_hash].each do |method|
25
+ it "#{method} should trim each word" do
26
+ subject.send(method, "test test123 \t\t\t aaa").should eq(:test => 1, :test123 => 1, :aaa => 1)
27
+ end
18
28
  end
19
29
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: reclassifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.2.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: