rblearn 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rblearn/CountVectorizer.rb +84 -89
- data/lib/rblearn/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0fb5e3f456a1e221345f6217eb95da98592eaf81
|
4
|
+
data.tar.gz: d14a5e3ef23821b381e77545c2ed71c90da82e0d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5b86f3d52a267b07277419d1dffc6f7ab461a8e524cf44af9de2c96e08b6e75fdb7a9ffac99f0f63534bfbb3994b325a426732cbad4f2740f582008e74996984
|
7
|
+
data.tar.gz: a393401ceac0d65543b3b1dfee6975bf82b0f695525f10702ed66abb4a64c481a250c24932effad3460b413eb25483d8eb88f91831fcca3a08397c03ef1525d1
|
@@ -1,100 +1,95 @@
|
|
1
1
|
|
2
2
|
module Rblearn
|
3
3
|
|
4
|
-
|
5
|
-
|
4
|
+
class CountVectorizer
|
6
5
|
# TODO: consider access control for all instance variables
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
6
|
+
attr_accessor :token2index
|
7
|
+
|
8
|
+
# tokenizer: lambda function :: string -> Array<string>
|
9
|
+
# lowercase: whether tokens are converted to lowercase :: bool
|
10
|
+
# stop_words: list of stop words :: Array<string>
|
11
|
+
# max_features: fraction of the ranked vocabulary kept as features :: Float \in [0, 1]
|
12
|
+
# TODO: limiting the vocabulary with max_features sometimes produces all-zero document vectors.
|
13
|
+
def initialize(tokenizer, lowercase=true, max_features=0.5)
|
14
|
+
@tokenizer = tokenizer
|
15
|
+
@lowercase = lowercase
|
16
|
+
|
17
|
+
stop_words = Stopwords::STOP_WORDS + ['-', '--', '(', ')', "\\", "'", '"', '!', '?', ':', ';', '.', ',', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now']
|
18
|
+
stop_words.map! {|token| token.stem}
|
19
|
+
stop_words.map! {|token| token.downcase} if @lowercase
|
20
|
+
@stopwords = stop_words
|
21
|
+
@max_feature = max_features
|
22
|
+
end
|
23
|
+
|
24
|
+
def get_feature_names
|
25
|
+
@feature_names
|
26
|
+
end
|
27
|
+
|
28
|
+
# features: one string per document :: Array<String> -> Numo::Int32
|
29
|
+
def fit_transform(features)
|
30
|
+
all_vocabularies = []
|
31
|
+
tf = Hash.new{|hash, token| hash[token] = 0}
|
32
|
+
df = Hash.new{|hash, token| hash[token] = 0}
|
33
|
+
tfidf = Hash.new{|hash, token| hash[token] = 0}
|
34
|
+
|
35
|
+
# features: Array<string>
|
36
|
+
features.each do |feature|
|
37
|
+
feature.downcase! if @lowercase
|
38
|
+
token_list = @tokenizer.call(feature).reject{|token| @stopwords.include?(token)}
|
39
|
+
all_vocabularies += token_list
|
40
|
+
|
41
|
+
token_list.each do |token|
|
42
|
+
tf[token] += 1
|
43
|
+
end
|
44
|
+
|
43
45
|
token_list.uniq.each do |token|
|
44
|
-
|
45
|
-
all_vocaburaries << token
|
46
|
+
df[token] += 1
|
46
47
|
end
|
47
|
-
|
48
|
+
end
|
49
|
+
|
50
|
+
# deduplicate to obtain the set of vocabulary tokens
|
51
|
+
all_vocabularies.uniq!
|
52
|
+
|
53
|
+
tf.sort{|(_, v1), (_, v2)| v2 <=> v1}.first(20).each do |token, count|
|
54
|
+
tf[token] = 0
|
55
|
+
end
|
56
|
+
|
57
|
+
all_vocabularies.each do |token|
|
58
|
+
tfval = Math.log(tf[token])
|
59
|
+
idfval = Math.log(all_vocabularies.size.to_f / df[token]) + 1
|
60
|
+
tfidf[token] = tfval * idfval
|
61
|
+
end
|
62
|
+
|
63
|
+
tfidf = tfidf.sort{|(_, v1), (_, v2)| v2 <=> v1}
|
48
64
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
word_tfidf_score[token] = tf * idf
|
65
|
+
feature_names = (0...(tfidf.size * @max_feature).to_i).map{|i| tfidf[i][0]}
|
66
|
+
token2index = {}
|
67
|
+
feature_names.each_with_index do |token, i|
|
68
|
+
token2index[token] = i
|
54
69
|
end
|
55
70
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
tokens.each do |token|
|
75
|
-
counter[token] += 1
|
76
|
-
end
|
77
|
-
|
78
|
-
counter.each do |token, freq|
|
79
|
-
doc_matrix[doc_id, token2index[token]] = freq if token2index[token]
|
80
|
-
end
|
81
|
-
end
|
82
|
-
|
83
|
-
@doc_matrix = doc_matrix
|
84
|
-
@feature_names = feature_names
|
85
|
-
@token2index = token2index
|
86
|
-
return @doc_matrix
|
87
|
-
end
|
88
|
-
end
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
if __FILE__ == $0
|
93
|
-
cv = CountVectorizer.new(lambda{|s| s.split.map{|token| token.stem}}, 1, 0.8)
|
94
|
-
features = ['I am train man which automata and philosophy', 'numerical analysis young man', 'logic programmer']
|
95
|
-
p cv.fit_transform(features)
|
96
|
-
p cv.feature_names
|
97
|
-
p cv.token2index
|
98
|
-
end
|
71
|
+
doc_matrix = Numo::Int32.zeros([features.size, feature_names.size])
|
72
|
+
features.each_with_index do |feature, doc_id|
|
73
|
+
tokens = []
|
74
|
+
@tokenizer.call(feature).each do |token|
|
75
|
+
token.downcase! if @lowercase
|
76
|
+
tokens << token unless @stopwords.include?(token)
|
77
|
+
end
|
78
|
+
|
79
|
+
# BoW representation
|
80
|
+
counter = Hash.new{|hash, key| hash[key] = 0}
|
81
|
+
tokens.each do |token|
|
82
|
+
counter[token] += 1
|
83
|
+
end
|
84
|
+
|
85
|
+
counter.each do |token, freq|
|
86
|
+
doc_matrix[doc_id, token2index[token]] = freq if token2index[token]
|
87
|
+
end
|
88
|
+
end
|
99
89
|
|
90
|
+
@feature_names = feature_names
|
91
|
+
@token2index = token2index
|
92
|
+
return doc_matrix
|
93
|
+
end
|
94
|
+
end
|
100
95
|
end
|
data/lib/rblearn/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rblearn
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- himkt
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-08-
|
11
|
+
date: 2016-08-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|