rblearn 0.2.2 → 0.3.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: baeb51c361fba549f4909d2d9dc082b56b07bc54
-  data.tar.gz: 17855594b1a780753f5a6e708be8603aaa3497db
+  metadata.gz: 0fb5e3f456a1e221345f6217eb95da98592eaf81
+  data.tar.gz: d14a5e3ef23821b381e77545c2ed71c90da82e0d
 SHA512:
-  metadata.gz: beb24d4020d67d1b9e7e6455d682433adee10848dfdb5e01a940a22d470778120713e05d2d0780173ec4d87aa0399895c97509f145768a2bbe13793e5bddddf0
-  data.tar.gz: 060d38b1a4fd38c05f4937b447f037bab7f662ea13f9849443a5c479601f8781b149cc450f54602225aae0961e11dba0cd77432f10b4e12feb80cb3b32df654c
+  metadata.gz: 5b86f3d52a267b07277419d1dffc6f7ab461a8e524cf44af9de2c96e08b6e75fdb7a9ffac99f0f63534bfbb3994b325a426732cbad4f2740f582008e74996984
+  data.tar.gz: a393401ceac0d65543b3b1dfee6975bf82b0f695525f10702ed66abb4a64c481a250c24932effad3460b413eb25483d8eb88f91831fcca3a08397c03ef1525d1
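These digests cover the two archives packed inside the .gem file, not the .gem itself. For readers who want to check them locally, a minimal verification sketch (not part of the gem; it assumes a local rblearn-0.3.0.gem, e.g. from `gem fetch rblearn -v 0.3.0`, and uses only the stdlib digest and rubygems/package APIs):

require 'digest'
require 'rubygems/package'

# A .gem is a tar archive; checksums.yaml records digests of the
# metadata.gz and data.tar.gz entries inside it.
File.open('rblearn-0.3.0.gem', 'rb') do |io|
  Gem::Package::TarReader.new(io) do |tar|
    tar.each do |entry|
      next unless ['metadata.gz', 'data.tar.gz'].include?(entry.full_name)
      bytes = entry.read
      puts "#{entry.full_name} SHA1:   #{Digest::SHA1.hexdigest(bytes)}"
      puts "#{entry.full_name} SHA512: #{Digest::SHA512.hexdigest(bytes)}"
    end
  end
end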
@@ -1,100 +1,95 @@
 
 module Rblearn
 
-  class CountVectorizer
-
+  class CountVectorizer
     # TODO: consider the access controll about all variables
-    attr_accessor :feature_names, :doc_matrix, :token2index
-
-    # tokenizer: lambda function :: string -> Array<string>
-    # lowcase: whether if words are lowercases :: bool
-    # stop_words: list of stop words :: Array<string>
-    # max_features: limitation of feature size :: Float \in [0, 1]
-    # TODO: by max_features, zero vectors are sometimes created.
-    def initialize(tokenizer, lowercase=true, max_features=0.8)
-      @tokenizer = tokenizer
-      @lowercase = lowercase
-
-      stop_words = Stopwords::STOP_WORDS
-      stop_words.map! {|token| token.stem}
-      stop_words.map! {|token| token.downcase} if @lowercase
-      @stopwords = stop_words
-      @max_feature = max_features
-    end
-
-    # features: Each documents' feature :: Array<String> -> NArray::Int64
-    def fit_transform(features)
-      all_vocaburaries = []
-      word_frequency = Hash.new{|hash, key| hash[key] = 0}
-      document_frequency = Hash.new{|hash, key| hash[key] = 0}
-      word_tfidf_score = Hash.new{|hash, key| hash[key] = 0}
-      document_size = features.size
-
-      features.each do |feature|
-        token_list = @tokenizer.call(feature)
-
-        # compute tf-value
-        token_list.each do |token|
-          token.downcase! if @lowercase
-          word_frequency[token] += 1
-        end
-
-        # compute df-value
+    attr_accessor :token2index
+
+    # tokenizer: lambda function :: string -> Array<string>
+    # lowcase: whether if words are lowercases :: bool
+    # stop_words: list of stop words :: Array<string>
+    # max_features: limitation of feature size :: Float \in [0, 1]
+    # TODO: by max_features, zero vectors are sometimes created.
+    def initialize(tokenizer, lowercase=true, max_features=0.5)
+      @tokenizer = tokenizer
+      @lowercase = lowercase
+
+      stop_words = Stopwords::STOP_WORDS + ['-', '--', '(', ')', "\\", "'", '"', '!', '?', ':', ';', '.', ',', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now']
+      stop_words.map! {|token| token.stem}
+      stop_words.map! {|token| token.downcase} if @lowercase
+      @stopwords = stop_words
+      @max_feature = max_features
+    end
+
+    def get_feature_names
+      @feature_names
+    end
+
+    # features: Each documents' feature :: Array<String> -> NArray::Int64
+    def fit_transform(features)
+      all_vocabularies = []
+      tf = Hash.new{|hash, token| hash[token] = 0}
+      df = Hash.new{|hash, token| hash[token] = 0}
+      tfidf = Hash.new{|hash, token| hash[token] = 0}
+
+      # features: Array<string>
+      features.each do |feature|
+        feature.downcase! if @lowercase
+        token_list = @tokenizer.call(feature).reject{|token| @stopwords.include?(token)}
+        all_vocabularies += token_list
+
+        token_list.each do |token|
+          tf[token] += 1
+        end
+
         token_list.uniq.each do |token|
-          document_frequency[token] += 1
-          all_vocaburaries << token
+          df[token] += 1
         end
-      end
+      end
+
+      # to get the set of vocabulary
+      all_vocabularies.uniq!
+
+      tf.sort{|(_, v1), (_, v2)| v2 <=> v1}.first(20).each do |token, count|
+        tf[token] = 0
+      end
+
+      all_vocabularies.each do |token|
+        tfval = Math.log(tf[token])
+        idfval = Math.log(all_vocabularies.size.to_f / df[token]) + 1
+        tfidf[token] = tfval * idfval
+      end
+
+      tfidf = tfidf.sort{|(_, v1), (_, v2)| v2 <=> v1}
 
-      all_vocaburaries.uniq!
-      all_vocaburaries.each do |token|
-        tf = 1 + Math.log(word_frequency[token])
-        idf = Math.log(1+(document_size/document_frequency[token]))
-        word_tfidf_score[token] = tf * idf
+      feature_names = (0...(tfidf.size * @max_feature).to_i).map{|i| tfidf[i][0]}
+      token2index = {}
+      feature_names.each_with_index do |token, i|
+        token2index[token] = i
       end
 
-      word_tfidf_score = word_tfidf_score.sort{|(_, v1), (_, v2)| v2 <=> v1}
-      feature_names = (0...(word_tfidf_score.size * @max_feature).to_i).map{|i| word_tfidf_score[i][0]}
-
-      token2index = {}
-      feature_names.each_with_index do |token, i|
-        token2index[token] = i
-      end
-
-      doc_matrix = Numo::Int32.zeros([features.size, feature_names.size])
-      features.each_with_index do |feature, doc_id|
-        tokens = []
-        @tokenizer.call(feature).each do |token|
-          token.downcase! if @lowercase
-          tokens << token unless @stopwords.include?(token)
-        end
-
-        # BoW representation
-        counter = Hash.new{|hash, key| hash[key] = 0}
-        tokens.each do |token|
-          counter[token] += 1
-        end
-
-        counter.each do |token, freq|
-          doc_matrix[doc_id, token2index[token]] = freq if token2index[token]
-        end
-      end
-
-      @doc_matrix = doc_matrix
-      @feature_names = feature_names
-      @token2index = token2index
-      return @doc_matrix
-    end
-  end
-
-
-
-  if __FILE__ == $0
-    cv = CountVectorizer.new(lambda{|s| s.split.map{|token| token.stem}}, 1, 0.8)
-    features = ['I am train man which automata and philosophy', 'numerical analysis young man', 'logic programmer']
-    p cv.fit_transform(features)
-    p cv.feature_names
-    p cv.token2index
-  end
+      doc_matrix = Numo::Int32.zeros([features.size, feature_names.size])
+      features.each_with_index do |feature, doc_id|
+        tokens = []
+        @tokenizer.call(feature).each do |token|
+          token.downcase! if @lowercase
+          tokens << token unless @stopwords.include?(token)
+        end
+
+        # BoW representation
+        counter = Hash.new{|hash, key| hash[key] = 0}
+        tokens.each do |token|
+          counter[token] += 1
+        end
+
+        counter.each do |token, freq|
+          doc_matrix[doc_id, token2index[token]] = freq if token2index[token]
+        end
+      end
 
+      @feature_names = feature_names
+      @token2index = token2index
+      return doc_matrix
+    end
+  end
 end
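Two changes in this rewrite are worth noting. The tf-idf weighting is different: 0.2.2 scored each token as (1 + log tf) * log(1 + N/df) with N the document count, while 0.3.0 first zeroes the term frequency of the 20 most frequent tokens and then scores log(tf) * (log(|V|/df) + 1) over the whole corpus. And the `if __FILE__ == $0` self-test that 0.2.2 shipped at the bottom of the file is gone. A minimal usage sketch of the 0.3.0 API, adapted from that removed block; it assumes the gem's dependencies supply `String#stem` and `Stopwords::STOP_WORDS`, both of which the diff calls but whose gemspec entries are not shown here:

require 'rblearn'

# Tokenizer: a lambda from String to Array<String>, as the comments in
# initialize describe; String#stem is assumed to come from a stemming gem.
tokenizer = lambda { |s| s.split.map { |token| token.stem } }

# lowercase=true; max_features=0.5 keeps the top half of the tf-idf-ranked vocabulary.
cv = Rblearn::CountVectorizer.new(tokenizer, true, 0.5)

features = ['I am train man which automata and philosophy',
            'numerical analysis young man',
            'logic programmer']

doc_matrix = cv.fit_transform(features)  # Numo::Int32, shape [n_docs, n_features]
p doc_matrix
p cv.get_feature_names   # reader method added in 0.3.0
p cv.token2index         # still exposed via attr_accessor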
@@ -1,3 +1,3 @@
 module Rblearn
-  VERSION = "0.2.2"
+  VERSION = "0.3.0"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rblearn
 version: !ruby/object:Gem::Version
-  version: 0.2.2
+  version: 0.3.0
 platform: ruby
 authors:
 - himkt
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-08-01 00:00:00.000000000 Z
+date: 2016-08-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler