reclassifier 0.1.4 → 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/reclassifier/version.rb +1 -1
- data/lib/reclassifier/word_hash.rb +5 -81
- data/spec/word_hash_spec.rb +10 -0
- metadata +1 -1
data/lib/reclassifier/version.rb
CHANGED
(one-line change, +1 -1; hunk not shown here)
data/lib/reclassifier/word_hash.rb
CHANGED
@@ -1,84 +1,8 @@
|
|
1
1
|
module Reclassifier::WordHash
|
2
|
-
CORPUS_SKIP_WORDS =
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
"are",
|
7
|
-
"also",
|
8
|
-
"an",
|
9
|
-
"and",
|
10
|
-
"as",
|
11
|
-
"at",
|
12
|
-
"but",
|
13
|
-
"by",
|
14
|
-
"came",
|
15
|
-
"can",
|
16
|
-
"cant",
|
17
|
-
"couldnt",
|
18
|
-
"did",
|
19
|
-
"didn",
|
20
|
-
"didnt",
|
21
|
-
"do",
|
22
|
-
"doesnt",
|
23
|
-
"dont",
|
24
|
-
"ever",
|
25
|
-
"first",
|
26
|
-
"from",
|
27
|
-
"have",
|
28
|
-
"her",
|
29
|
-
"here",
|
30
|
-
"him",
|
31
|
-
"how",
|
32
|
-
"i",
|
33
|
-
"if",
|
34
|
-
"in",
|
35
|
-
"into",
|
36
|
-
"is",
|
37
|
-
"isnt",
|
38
|
-
"it",
|
39
|
-
"itll",
|
40
|
-
"just",
|
41
|
-
"last",
|
42
|
-
"least",
|
43
|
-
"like",
|
44
|
-
"most",
|
45
|
-
"my",
|
46
|
-
"new",
|
47
|
-
"no",
|
48
|
-
"not",
|
49
|
-
"now",
|
50
|
-
"of",
|
51
|
-
"on",
|
52
|
-
"or",
|
53
|
-
"should",
|
54
|
-
"sinc",
|
55
|
-
"so",
|
56
|
-
"some",
|
57
|
-
"th",
|
58
|
-
"than",
|
59
|
-
"this",
|
60
|
-
"that",
|
61
|
-
"the",
|
62
|
-
"their",
|
63
|
-
"then",
|
64
|
-
"those",
|
65
|
-
"to",
|
66
|
-
"told",
|
67
|
-
"too",
|
68
|
-
"true",
|
69
|
-
"try",
|
70
|
-
"until",
|
71
|
-
"url",
|
72
|
-
"us",
|
73
|
-
"were",
|
74
|
-
"when",
|
75
|
-
"whether",
|
76
|
-
"while",
|
77
|
-
"with",
|
78
|
-
"within",
|
79
|
-
"yes",
|
80
|
-
"you",
|
81
|
-
"youll"]
|
2
|
+
CORPUS_SKIP_WORDS = %w(a again all along are also an and as at but by came can cant couldnt did didn didnt do
|
3
|
+
doesnt dont ever first from have her here him how i if in into is isnt it itll just last
|
4
|
+
least like most my new no not now of on or should sinc so some th than this that the
|
5
|
+
their then those to told too true try until url us were when whether while with within yes you youll)
|
82
6
|
|
83
7
|
# Return a Hash of strings => ints. Each word in the string is stemmed,
|
84
8
|
# symbolized, and indexed to its frequency in the document.
|
@@ -88,7 +12,7 @@ module Reclassifier::WordHash
|
|
88
12
|
|
89
13
|
# Return a word hash without extra punctuation or short symbols, just stemmed words
|
90
14
|
def clean_word_hash(string)
|
91
|
-
word_hash_for_words(string.gsub(/[^\w\s]/,"").split)
|
15
|
+
word_hash_for_words(string.gsub(/[^\w\s]/," ").split)
|
92
16
|
end
|
93
17
|
|
94
18
|
def word_hash_for_words(words)
|
data/spec/word_hash_spec.rb
CHANGED
@@ -15,5 +15,15 @@ describe Reclassifier::Bayes do
|
|
15
15
|
|
16
16
|
subject.clean_word_hash("here are some good words of test's. I hope you love them!").should eq(hash)
|
17
17
|
end
|
18
|
+
|
19
|
+
it 'should convert non-word characters to spaces' do
|
20
|
+
subject.clean_word_hash('Payment-Transfer').should eq(:payment => 1, :transfer => 1)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
[:word_hash, :clean_word_hash].each do |method|
|
25
|
+
it "#{method} should trim each word" do
|
26
|
+
subject.send(method, "test test123 \t\t\t aaa").should eq(:test => 1, :test123 => 1, :aaa => 1)
|
27
|
+
end
|
18
28
|
end
|
19
29
|
end
|