rblearn 0.2.2 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rblearn/CountVectorizer.rb +84 -89
- data/lib/rblearn/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0fb5e3f456a1e221345f6217eb95da98592eaf81
|
4
|
+
data.tar.gz: d14a5e3ef23821b381e77545c2ed71c90da82e0d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5b86f3d52a267b07277419d1dffc6f7ab461a8e524cf44af9de2c96e08b6e75fdb7a9ffac99f0f63534bfbb3994b325a426732cbad4f2740f582008e74996984
|
7
|
+
data.tar.gz: a393401ceac0d65543b3b1dfee6975bf82b0f695525f10702ed66abb4a64c481a250c24932effad3460b413eb25483d8eb88f91831fcca3a08397c03ef1525d1
|
@@ -1,100 +1,95 @@
|
|
1
1
|
|
2
2
|
module Rblearn
|
3
3
|
|
4
|
-
|
5
|
-
|
4
|
+
class CountVectorizer
  # TODO: consider the access control of all instance variables
  attr_accessor :token2index

  # Punctuation marks and common English function words that are always
  # filtered out, in addition to Stopwords::STOP_WORDS.
  EXTRA_STOP_WORDS = ['-', '--', '(', ')', "\\", "'", '"', '!', '?', ':', ';', '.', ',', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now'].freeze

  # Number of most frequent terms whose term frequency is reset to zero
  # before scoring, so that they rank last during feature selection.
  SUPPRESSED_TOP_TERMS = 20

  # tokenizer: callable (e.g. a lambda) :: String -> Array<String>
  # lowercase: whether documents (and stop words) are lower-cased :: bool
  # max_features: fraction of the ranked vocabulary kept as features ::
  #   Float in [0, 1]; values outside that range are clamped in fit_transform
  # TODO: by max_features, zero vectors are sometimes created.
  def initialize(tokenizer, lowercase=true, max_features=0.5)
    @tokenizer = tokenizer
    @lowercase = lowercase

    # Stop words are stemmed first and lower-cased afterwards.
    stop_words = (Stopwords::STOP_WORDS + EXTRA_STOP_WORDS).map { |token| token.stem }
    stop_words.map! { |token| token.downcase } if @lowercase
    @stopwords = stop_words.uniq
    @max_feature = max_features
  end

  # Returns the feature names selected by the last call to fit_transform
  # (nil before fit_transform has been called).
  def get_feature_names
    @feature_names
  end

  # Builds the vocabulary from the given documents, keeps the top-ranked
  # fraction (max_features) of it as features and returns the document/term
  # count matrix.
  #
  # features: the documents :: Array<String>; the strings are not modified
  # returns: Numo::Int32 matrix of shape [features.size, feature_names.size]
  #   whose cell [doc, token2index[token]] is the token count in that document
  #
  # Side effects: stores the selected names (see get_feature_names) and the
  # token -> column mapping (see token2index) on the instance.
  def fit_transform(features)
    # Tokenize each document once and reuse the result for both the
    # statistics pass and the matrix pass, so both see identical tokens.
    token_lists = features.map { |feature| tokenize(feature) }

    tf = Hash.new(0) # occurrences of each token over the whole corpus
    df = Hash.new(0) # number of documents containing each token
    token_lists.each do |token_list|
      token_list.each { |token| tf[token] += 1 }
      token_list.uniq.each { |token| df[token] += 1 }
    end

    # Keys are inserted on first occurrence, so this is the vocabulary in
    # first-seen order.
    vocabulary = tf.keys

    # Reset the most frequent terms so that they are ranked last.
    tf.sort { |(_, v1), (_, v2)| v2 <=> v1 }.first(SUPPRESSED_TOP_TERMS).each do |token, _|
      tf[token] = 0
    end

    scores = vocabulary.map do |token|
      [token, tfidf_score(tf[token], df[token], vocabulary.size)]
    end
    ranked = scores.sort { |(_, v1), (_, v2)| v2 <=> v1 }

    # Clamp so that a max_features outside [0, 1] cannot index past the
    # ranked list or request a negative number of features.
    limit = (ranked.size * @max_feature).to_i
    limit = [[limit, 0].max, ranked.size].min
    feature_names = ranked.first(limit).map { |token, _| token }

    token2index = {}
    feature_names.each_with_index do |token, i|
      token2index[token] = i
    end

    doc_matrix = Numo::Int32.zeros([features.size, feature_names.size])
    token_lists.each_with_index do |token_list, doc_id|
      # BoW representation
      counter = Hash.new(0)
      token_list.each { |token| counter[token] += 1 }

      counter.each do |token, freq|
        column = token2index[token]
        # Tokens that were not selected as features have no column.
        doc_matrix[doc_id, column] = freq if column
      end
    end

    @feature_names = feature_names
    @token2index = token2index
    doc_matrix
  end

  private

  # Lower-cases a copy of the document (when enabled), tokenizes it and drops
  # stop words. The caller's string is never mutated.
  def tokenize(document)
    text = @lowercase ? document.downcase : document
    @tokenizer.call(text).reject { |token| @stopwords.include?(token) }
  end

  # Score used to rank vocabulary entries: log(tf) * (log(size / df) + 1).
  # Suppressed terms (tf == 0) always get -Infinity so they sort last.
  # NOTE(review): the idf numerator is the vocabulary size, not the number of
  # documents used by the textbook formula -- confirm this is intended.
  def tfidf_score(term_count, doc_count, vocabulary_size)
    return -Float::INFINITY if term_count.zero?

    idf = Math.log(vocabulary_size.to_f / doc_count) + 1
    Math.log(term_count) * idf
  end
end
|
100
95
|
end
|
data/lib/rblearn/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rblearn
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- himkt
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-08-
|
11
|
+
date: 2016-08-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|