reclassifier 0.1.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/reclassifier/version.rb +1 -1
- data/lib/reclassifier/word_hash.rb +5 -81
- data/spec/word_hash_spec.rb +10 -0
- metadata +1 -1
data/lib/reclassifier/version.rb
CHANGED
data/lib/reclassifier/word_hash.rb
CHANGED
@@ -1,84 +1,8 @@
|
|
1
1
|
module Reclassifier::WordHash
|
2
|
-
CORPUS_SKIP_WORDS =
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
"are",
|
7
|
-
"also",
|
8
|
-
"an",
|
9
|
-
"and",
|
10
|
-
"as",
|
11
|
-
"at",
|
12
|
-
"but",
|
13
|
-
"by",
|
14
|
-
"came",
|
15
|
-
"can",
|
16
|
-
"cant",
|
17
|
-
"couldnt",
|
18
|
-
"did",
|
19
|
-
"didn",
|
20
|
-
"didnt",
|
21
|
-
"do",
|
22
|
-
"doesnt",
|
23
|
-
"dont",
|
24
|
-
"ever",
|
25
|
-
"first",
|
26
|
-
"from",
|
27
|
-
"have",
|
28
|
-
"her",
|
29
|
-
"here",
|
30
|
-
"him",
|
31
|
-
"how",
|
32
|
-
"i",
|
33
|
-
"if",
|
34
|
-
"in",
|
35
|
-
"into",
|
36
|
-
"is",
|
37
|
-
"isnt",
|
38
|
-
"it",
|
39
|
-
"itll",
|
40
|
-
"just",
|
41
|
-
"last",
|
42
|
-
"least",
|
43
|
-
"like",
|
44
|
-
"most",
|
45
|
-
"my",
|
46
|
-
"new",
|
47
|
-
"no",
|
48
|
-
"not",
|
49
|
-
"now",
|
50
|
-
"of",
|
51
|
-
"on",
|
52
|
-
"or",
|
53
|
-
"should",
|
54
|
-
"sinc",
|
55
|
-
"so",
|
56
|
-
"some",
|
57
|
-
"th",
|
58
|
-
"than",
|
59
|
-
"this",
|
60
|
-
"that",
|
61
|
-
"the",
|
62
|
-
"their",
|
63
|
-
"then",
|
64
|
-
"those",
|
65
|
-
"to",
|
66
|
-
"told",
|
67
|
-
"too",
|
68
|
-
"true",
|
69
|
-
"try",
|
70
|
-
"until",
|
71
|
-
"url",
|
72
|
-
"us",
|
73
|
-
"were",
|
74
|
-
"when",
|
75
|
-
"whether",
|
76
|
-
"while",
|
77
|
-
"with",
|
78
|
-
"within",
|
79
|
-
"yes",
|
80
|
-
"you",
|
81
|
-
"youll"]
|
2
|
+
CORPUS_SKIP_WORDS = %w(a again all along are also an and as at but by came can cant couldnt did didn didnt do
|
3
|
+
doesnt dont ever first from have her here him how i if in into is isnt it itll just last
|
4
|
+
least like most my new no not now of on or should sinc so some th than this that the
|
5
|
+
their then those to told too true try until url us were when whether while with within yes you youll)
|
82
6
|
|
83
7
|
# Return a Hash of strings => ints. Each word in the string is stemmed,
|
84
8
|
# symbolized, and indexed to its frequency in the document.
|
@@ -88,7 +12,7 @@ module Reclassifier::WordHash
|
|
88
12
|
|
89
13
|
# Return a word hash without extra punctuation or short symbols, just stemmed words
|
90
14
|
def clean_word_hash(string)
|
91
|
-
word_hash_for_words(string.gsub(/[^\w\s]/,"").split)
|
15
|
+
word_hash_for_words(string.gsub(/[^\w\s]/," ").split)
|
92
16
|
end
|
93
17
|
|
94
18
|
def word_hash_for_words(words)
|
data/spec/word_hash_spec.rb
CHANGED
@@ -15,5 +15,15 @@ describe Reclassifier::Bayes do
|
|
15
15
|
|
16
16
|
subject.clean_word_hash("here are some good words of test's. I hope you love them!").should eq(hash)
|
17
17
|
end
|
18
|
+
|
19
|
+
it 'should convert non-word characters to spaces' do
|
20
|
+
subject.clean_word_hash('Payment-Transfer').should eq(:payment => 1, :transfer => 1)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
[:word_hash, :clean_word_hash].each do |method|
|
25
|
+
it "#{method} should trim each word" do
|
26
|
+
subject.send(method, "test test123 \t\t\t aaa").should eq(:test => 1, :test123 => 1, :aaa => 1)
|
27
|
+
end
|
18
28
|
end
|
19
29
|
end
|