rblearn 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: baeb51c361fba549f4909d2d9dc082b56b07bc54
4
- data.tar.gz: 17855594b1a780753f5a6e708be8603aaa3497db
3
+ metadata.gz: 0fb5e3f456a1e221345f6217eb95da98592eaf81
4
+ data.tar.gz: d14a5e3ef23821b381e77545c2ed71c90da82e0d
5
5
  SHA512:
6
- metadata.gz: beb24d4020d67d1b9e7e6455d682433adee10848dfdb5e01a940a22d470778120713e05d2d0780173ec4d87aa0399895c97509f145768a2bbe13793e5bddddf0
7
- data.tar.gz: 060d38b1a4fd38c05f4937b447f037bab7f662ea13f9849443a5c479601f8781b149cc450f54602225aae0961e11dba0cd77432f10b4e12feb80cb3b32df654c
6
+ metadata.gz: 5b86f3d52a267b07277419d1dffc6f7ab461a8e524cf44af9de2c96e08b6e75fdb7a9ffac99f0f63534bfbb3994b325a426732cbad4f2740f582008e74996984
7
+ data.tar.gz: a393401ceac0d65543b3b1dfee6975bf82b0f695525f10702ed66abb4a64c481a250c24932effad3460b413eb25483d8eb88f91831fcca3a08397c03ef1525d1
@@ -1,100 +1,95 @@
1
1
 
2
2
  module Rblearn
3
3
 
4
- class CountVectorizer
5
-
4
+ class CountVectorizer
6
5
  # TODO: reconsider access control for all instance variables
7
- attr_accessor :feature_names, :doc_matrix, :token2index
8
-
9
- # tokenizer: lambda function :: string -> Array<string>
10
- # lowcase: whether if words are lowercases :: bool
11
- # stop_words: list of stop words :: Array<string>
12
- # max_features: limitation of feature size :: Float \in [0, 1]
13
- # TODO: by max_features, zero vectors are sometimes created.
14
- def initialize(tokenizer, lowercase=true, max_features=0.8)
15
- @tokenizer = tokenizer
16
- @lowercase = lowercase
17
-
18
- stop_words = Stopwords::STOP_WORDS
19
- stop_words.map! {|token| token.stem}
20
- stop_words.map! {|token| token.downcase} if @lowercase
21
- @stopwords = stop_words
22
- @max_feature = max_features
23
- end
24
-
25
- # features: Each documents' feature :: Array<String> -> NArray::Int64
26
- def fit_transform(features)
27
- all_vocaburaries = []
28
- word_frequency = Hash.new{|hash, key| hash[key] = 0}
29
- document_frequency = Hash.new{|hash, key| hash[key] = 0}
30
- word_tfidf_score = Hash.new{|hash, key| hash[key] = 0}
31
- document_size = features.size
32
-
33
- features.each do |feature|
34
- token_list = @tokenizer.call(feature)
35
-
36
- # compute tf-value
37
- token_list.each do |token|
38
- token.downcase! if @lowercase
39
- word_frequency[token] += 1
40
- end
41
-
42
- # compute df-value
6
+ attr_accessor :token2index
7
+
8
+ # tokenizer: lambda function :: string -> Array<string>
9
+ # lowercase: whether input text is downcased before tokenizing :: bool
10
+ # stop words: not a constructor argument; built from Stopwords::STOP_WORDS plus the extra tokens listed below :: Array<string>
11
+ # max_features: fraction of the tf-idf-ranked vocabulary kept as features :: Float \in [0, 1]
12
+ # TODO: truncating the vocabulary by max_features sometimes produces all-zero document vectors.
13
+ def initialize(tokenizer, lowercase=true, max_features=0.5)
14
+ @tokenizer = tokenizer
15
+ @lowercase = lowercase
16
+
17
+ stop_words = Stopwords::STOP_WORDS + ['-', '--', '(', ')', "\\", "'", '"', '!', '?', ':', ';', '.', ',', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now']
18
+ stop_words.map! {|token| token.stem}
19
+ stop_words.map! {|token| token.downcase} if @lowercase
20
+ @stopwords = stop_words
21
+ @max_feature = max_features
22
+ end
23
+
24
+ def get_feature_names
25
+ @feature_names
26
+ end
27
+
28
+ # features: one string per document :: Array<String> -> Numo::Int32 (documents x features)
29
+ def fit_transform(features)
30
+ all_vocabularies = []
31
+ tf = Hash.new{|hash, token| hash[token] = 0}
32
+ df = Hash.new{|hash, token| hash[token] = 0}
33
+ tfidf = Hash.new{|hash, token| hash[token] = 0}
34
+
35
+ # features: Array<string>
36
+ features.each do |feature|
37
+ feature.downcase! if @lowercase
38
+ token_list = @tokenizer.call(feature).reject{|token| @stopwords.include?(token)}
39
+ all_vocabularies += token_list
40
+
41
+ token_list.each do |token|
42
+ tf[token] += 1
43
+ end
44
+
43
45
  token_list.uniq.each do |token|
44
- document_frequency[token] += 1
45
- all_vocaburaries << token
46
+ df[token] += 1
46
47
  end
47
- end
48
+ end
49
+
50
+ # deduplicate to obtain the vocabulary set
51
+ all_vocabularies.uniq!
52
+
53
+ tf.sort{|(_, v1), (_, v2)| v2 <=> v1}.first(20).each do |token, count|
54
+ tf[token] = 0
55
+ end
56
+
57
+ all_vocabularies.each do |token|
58
+ tfval = Math.log(tf[token])
59
+ idfval = Math.log(all_vocabularies.size.to_f / df[token]) + 1
60
+ tfidf[token] = tfval * idfval
61
+ end
62
+
63
+ tfidf = tfidf.sort{|(_, v1), (_, v2)| v2 <=> v1}
48
64
 
49
- all_vocaburaries.uniq!
50
- all_vocaburaries.each do |token|
51
- tf = 1 + Math.log(word_frequency[token])
52
- idf = Math.log(1+(document_size/document_frequency[token]))
53
- word_tfidf_score[token] = tf * idf
65
+ feature_names = (0...(tfidf.size * @max_feature).to_i).map{|i| tfidf[i][0]}
66
+ token2index = {}
67
+ feature_names.each_with_index do |token, i|
68
+ token2index[token] = i
54
69
  end
55
70
 
56
- word_tfidf_score = word_tfidf_score.sort{|(_, v1), (_, v2)| v2 <=> v1}
57
- feature_names = (0...(word_tfidf_score.size * @max_feature).to_i).map{|i| word_tfidf_score[i][0]}
58
-
59
- token2index = {}
60
- feature_names.each_with_index do |token, i|
61
- token2index[token] = i
62
- end
63
-
64
- doc_matrix = Numo::Int32.zeros([features.size, feature_names.size])
65
- features.each_with_index do |feature, doc_id|
66
- tokens = []
67
- @tokenizer.call(feature).each do |token|
68
- token.downcase! if @lowercase
69
- tokens << token unless @stopwords.include?(token)
70
- end
71
-
72
- # BoW representation
73
- counter = Hash.new{|hash, key| hash[key] = 0}
74
- tokens.each do |token|
75
- counter[token] += 1
76
- end
77
-
78
- counter.each do |token, freq|
79
- doc_matrix[doc_id, token2index[token]] = freq if token2index[token]
80
- end
81
- end
82
-
83
- @doc_matrix = doc_matrix
84
- @feature_names = feature_names
85
- @token2index = token2index
86
- return @doc_matrix
87
- end
88
- end
89
-
90
-
91
-
92
- if __FILE__ == $0
93
- cv = CountVectorizer.new(lambda{|s| s.split.map{|token| token.stem}}, 1, 0.8)
94
- features = ['I am train man which automata and philosophy', 'numerical analysis young man', 'logic programmer']
95
- p cv.fit_transform(features)
96
- p cv.feature_names
97
- p cv.token2index
98
- end
71
+ doc_matrix = Numo::Int32.zeros([features.size, feature_names.size])
72
+ features.each_with_index do |feature, doc_id|
73
+ tokens = []
74
+ @tokenizer.call(feature).each do |token|
75
+ token.downcase! if @lowercase
76
+ tokens << token unless @stopwords.include?(token)
77
+ end
78
+
79
+ # BoW representation
80
+ counter = Hash.new{|hash, key| hash[key] = 0}
81
+ tokens.each do |token|
82
+ counter[token] += 1
83
+ end
84
+
85
+ counter.each do |token, freq|
86
+ doc_matrix[doc_id, token2index[token]] = freq if token2index[token]
87
+ end
88
+ end
99
89
 
90
+ @feature_names = feature_names
91
+ @token2index = token2index
92
+ return doc_matrix
93
+ end
94
+ end
100
95
  end
@@ -1,3 +1,3 @@
1
1
  module Rblearn
2
- VERSION = "0.2.2"
2
+ VERSION = "0.3.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rblearn
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - himkt
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-08-01 00:00:00.000000000 Z
11
+ date: 2016-08-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler