word2vec 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,6 @@
1
+ require "word2vec/version"
2
+
3
+ require "word2vec/io"
4
+ require "word2vec/word_vectors"
5
+ require "word2vec/word_clusters"
6
+ require "word2vec/scripts_interface"
@@ -0,0 +1,27 @@
1
module Word2Vec
  # Loads word vectors from +fname+ with the loader matching +kind+.
  #
  # @param fname [String] path of the vectors file
  # @param args extra positional arguments forwarded to the loader
  # @param kind [String] 'auto', 'bin', 'txt' or 'mmap'; 'auto' picks 'bin'
  #   or 'txt' from the file name suffix
  # @param kwargs extra keyword arguments forwarded to the loader
  # @return whatever the selected Word2Vec::WordVectors loader returns
  # @raise [RuntimeError] when the kind cannot be guessed or is not recognised
  def self.load(fname, *args, kind: 'auto', **kwargs)
    if kind == 'auto'
      kind =
        if fname.end_with?('.bin')
          'bin'
        elsif fname.end_with?('.txt')
          'txt'
        else
          raise 'Could not identify kind'
        end
    end

    case kind
    when 'bin'
      Word2Vec::WordVectors.from_binary(fname, *args, **kwargs)
    when 'txt'
      Word2Vec::WordVectors.from_text(fname, *args, **kwargs)
    when 'mmap'
      Word2Vec::WordVectors.from_mmap(fname, *args, **kwargs)
    else
      raise 'Unknown kind'
    end
  end

  # Loads word clusters from the text file +fname+.
  #
  # @param fname [String] path of the clusters file
  # @return whatever Word2Vec::WordClusters.from_text returns
  def self.load_clusters(fname)
    Word2Vec::WordClusters.from_text(fname)
  end
end
@@ -0,0 +1,97 @@
1
module Word2Vec
  # Trains word vectors by running the bundled `word2vec` executable.
  #
  # Each keyword maps to the command-line flag of the same name
  # (`iter_` -> `-iter`, `min_count` -> `-min-count`). `save_vocab` and
  # `read_vocab` are only passed when they are not nil.
  #
  # @param train [String] value for `-train`
  # @param output [String] value for `-output`
  # @return [true, false, nil] the result of Kernel#system
  def self.word2vec(train, output, size: 100, window: 5, sample: '1e-3', hs: 0,
                    negative: 5, threads: 12, iter_: 5, min_count: 5, alpha: 0.025,
                    debug: 2, binary: 1, cbow: 1, save_vocab: nil, read_vocab: nil,
                    verbose: false)
    options = {
      '-train' => train, '-output' => output, '-size' => size,
      '-window' => window, '-sample' => sample, '-hs' => hs,
      '-negative' => negative, '-threads' => threads, '-iter' => iter_,
      '-min-count' => min_count, '-alpha' => alpha, '-debug' => debug,
      '-binary' => binary, '-cbow' => cbow
    }
    command = build_command('word2vec', options,
                            save_vocab: save_vocab, read_vocab: read_vocab)
    run_cmd(command, verbose: verbose)
  end

  # Same as {word2vec}, with the additional `-classes` flag appended after
  # `-cbow`.
  #
  # @param train [String] value for `-train`
  # @param output [String] value for `-output`
  # @param classes [Integer] value for `-classes`
  # @return [true, false, nil] the result of Kernel#system
  def self.word2clusters(train, output, classes, size: 100, window: 5, sample: '1e-3',
                         hs: 0, negative: 5, threads: 12, iter_: 5, min_count: 5,
                         alpha: 0.025, debug: 2, binary: 1, cbow: 1,
                         save_vocab: nil, read_vocab: nil, verbose: false)
    options = {
      '-train' => train, '-output' => output, '-size' => size,
      '-window' => window, '-sample' => sample, '-hs' => hs,
      '-negative' => negative, '-threads' => threads, '-iter' => iter_,
      '-min-count' => min_count, '-alpha' => alpha, '-debug' => debug,
      '-binary' => binary, '-cbow' => cbow, '-classes' => classes
    }
    command = build_command('word2vec', options,
                            save_vocab: save_vocab, read_vocab: read_vocab)
    run_cmd(command, verbose: verbose)
  end

  # Runs the bundled `word2phrase` executable.
  #
  # @param train [String] value for `-train`
  # @param output [String] value for `-output`
  # @return [true, false, nil] the result of Kernel#system
  def self.word2phrase(train, output, min_count: 5, threshold: 100, debug: 2,
                       verbose: false)
    options = {
      '-train' => train, '-output' => output, '-min-count' => min_count,
      '-threshold' => threshold, '-debug' => debug
    }
    run_cmd(build_command('word2phrase', options), verbose: verbose)
  end

  # Placeholder; always raises.
  #
  # @raise [NotImplementedError]
  def self.doc2vec(train, output, size: 100, window: 5, sample: '1e-3', hs: 0, negative: 5,
                   threads: 12, iter_: 5, min_count: 5, alpha: 0.025, debug: 2, binary: 1,
                   cbow: 1,
                   save_vocab: nil, read_vocab: nil, verbose: nil)
    raise NotImplementedError
  end

  # Prints the command and executes it.
  #
  # The command is handed to Kernel#system as an argument vector, so no
  # shell is involved: arguments containing spaces or shell metacharacters
  # (file paths, for instance) reach the program unchanged instead of being
  # re-split or interpreted.
  #
  # @param command [Array<String>] program followed by its arguments
  # @param verbose [Boolean] currently unused
  # @return [true, false, nil] the result of Kernel#system
  def self.run_cmd(command, verbose: false)
    p command.join(' ')
    program, *arguments = command
    # The [program, argv0] pair forces the no-shell form even when there
    # are no further arguments.
    system([program, program], *arguments)

    # TODO: implement it later -- when verbose, stream the program's output
    # line by line and raise when a line contains 'ERROR:'.
  end

  # Assembles the argument vector for one of the executables in ext/word2vec.
  #
  # @param binary [String] executable name inside ext/word2vec
  # @param options [Hash{String => Object}] flag => value, emitted in
  #   insertion order with every value converted by #to_s
  # @param save_vocab [Object, nil] value for `-save-vocab`, skipped when nil
  # @param read_vocab [Object, nil] value for `-read-vocab`, skipped when nil
  # @return [Array<String>]
  def self.build_command(binary, options, save_vocab: nil, read_vocab: nil)
    ext = File.expand_path('../../../ext/word2vec', __FILE__)
    command = [File.join(ext, binary)]
    options.each { |flag, value| command << flag << value.to_s }
    command << '-save-vocab' << save_vocab.to_s unless save_vocab.nil?
    command << '-read-vocab' << read_vocab.to_s unless read_vocab.nil?
    command
  end
  private_class_method :build_command
end
@@ -0,0 +1,9 @@
1
+ require "nmatrix"
2
+
3
module Word2Vec
  class WordVectors
    # Scales +vec+ by the reciprocal of its NMatrix#norm2.
    #
    # @param vec [Array<Numeric>] vector components
    # @return [Array] the scaled components, as produced by NMatrix#to_a
    def self.unitvec(vec)
      matrix = NMatrix[*vec]
      scale = 1.0 / matrix.norm2
      (matrix * scale).to_a
    end
  end
end
@@ -0,0 +1,3 @@
1
module Word2Vec
  # Release number of the gem.
  VERSION = '0.1.1'
end
@@ -0,0 +1,36 @@
1
+ require "csv"
2
+
3
module Word2Vec
  # Vocabulary words together with the cluster id assigned to each word;
  # clusters[i] belongs to vocab[i].
  class WordClusters
    attr_accessor :vocab, :clusters

    # @param vocab [Array<String>] words
    # @param clusters [Array<Integer>] cluster id of the word at the same index
    def initialize(vocab:, clusters:)
      self.vocab = vocab
      self.clusters = clusters
    end

    # Not implemented yet.
    # @raise [NotImplementedError]
    def ix(word)
      raise NotImplementedError
    end

    # Not implemented yet.
    # @raise [NotImplementedError]
    def [](word)
      raise NotImplementedError
    end

    # Not implemented yet.
    # @raise [NotImplementedError]
    def get_cluster(word)
      raise NotImplementedError
    end

    # @param cluster [Integer] cluster id
    # @return [Array<String>] the words whose cluster id equals +cluster+,
    #   in vocabulary order
    def get_words_on_cluster(cluster)
      indices = clusters.each_index.select { |i| clusters[i] == cluster }
      vocab.values_at(*indices)
    end

    # Reads a clusters file made of "<word> <cluster>" lines.
    #
    # Lines are split on whitespace rather than parsed as CSV, so words
    # containing a double quote do not abort the load; blank lines are
    # skipped and an empty file yields an empty vocabulary.
    #
    # @param fname [String] path of the clusters file
    # @return [WordClusters]
    # @raise [ArgumentError] when a non-blank line has no cluster column
    def self.from_text(fname)
      vocab = []
      clusters = []

      File.foreach(fname).with_index(1) do |line, lineno|
        word, cluster = line.split(' ')
        next if word.nil? # blank line

        if cluster.nil?
          raise ArgumentError, "missing cluster id on line #{lineno} of #{fname}"
        end

        vocab << word
        clusters << cluster.to_i
      end

      new(vocab: vocab, clusters: clusters)
    end
  end
end
@@ -0,0 +1,182 @@
1
+ require "nmatrix"
2
+
3
+ require "word2vec/utils"
4
+
5
module Word2Vec
  # Word embeddings held in memory: a vocabulary, one vector per word and,
  # optionally, cluster assignments. vectors[i] belongs to vocab[i].
  class WordVectors
    attr_accessor :vocab, :vectors, :clusters, :vocab_hash

    # @param vocab [Array<String>] words
    # @param vectors [Array<Array<Float>>] one vector per word, same order
    # @param clusters [#clusters, nil] object whose #clusters array is
    #   indexed like +vocab+ (used by #generate_response)
    def initialize(vocab:, vectors:, clusters: nil)
      @vocab = vocab
      @vectors = vectors
      @clusters = clusters

      # word => index lookup; for a repeated word the last index wins
      @vocab_hash = {}
      vocab.each_with_index { |word, i| @vocab_hash[word] = i }
    end

    # @return [Integer, nil] index of +word+, nil when it is not in the vocabulary
    def ix(word)
      vocab_hash[word]
    end

    # @return [String, nil] the word stored at index +ix+
    def word(ix)
      vocab[ix]
    end

    # Shorthand for #get_vector.
    def [](word)
      get_vector(word)
    end

    # @return [Boolean] whether +word+ is part of the vocabulary
    def include?(word)
      vocab_hash.key?(word)
    end

    # @return the vector stored for +word+
    def get_vector(word)
      idx = ix(word)
      vectors[idx]
    end

    # Ranks the vocabulary by dot product with the vector of +word+.
    #
    # The top-ranked entry is skipped (presumably the word itself, given
    # unit-length vectors) and the next +n+ entries are returned.
    #
    # @return [Array(Array<Integer>, Array<Float>)] indices and their metrics
    def cosine(word, n: 10)
      metrics = NMatrix[*vectors, dtype: :float32].dot(NMatrix[self[word], dtype: :float32].transpose)
      best = metrics.sorted_indices.reverse[1..n]
      best_metrics = metrics.to_a.values_at(*best).flatten
      [best, best_metrics]
    end

    # Ranks the vocabulary by dot product with the mean of the +pos+ vectors
    # and the negated +neg+ vectors, leaving the query words themselves out.
    #
    # @param pos [Array<String>] words contributing positively
    # @param neg [Array<String>] words contributing negatively
    # @param n [Integer] number of results
    # @return [Array(Array<Integer>, Array<Float>)] indices and their metrics
    def analogy(pos:, neg:, n: 10)
      exclude = pos + neg
      signed = pos.map { |word| [word, 1.0] } + neg.map { |word| [word, -1.0] }

      directed = signed.map do |word, direction|
        (NMatrix[*self[word], dtype: :float32] * direction).to_a
      end
      mean = NMatrix[*directed, dtype: :float32].mean

      metrics = NMatrix[*vectors, dtype: :float32].dot(mean.transpose)
      # over-fetch so that n results remain once the query words are removed
      best = metrics.sorted_indices.reverse[0...(n + exclude.size)]

      # Filter by vocabulary index in one pass; deleting by position one at a
      # time shifts the remaining positions and removes the wrong entries.
      excluded = exclude.map { |word| ix(word) }
      best = best.reject { |index| excluded.include?(index) }

      best_metrics = metrics.to_a.flatten.values_at(*best)
      [best[0...n], best_metrics[0...n]]
    end

    # Pairs result indices with their words and metrics.
    #
    # @param indices [Array<Integer>] vocabulary indices
    # @param metrics [Array<Float>] one metric per index
    # @param clusters [Boolean] also append the cluster of each word when
    #   cluster data is attached
    # @return [Array<Array>] [word, metric] or [word, metric, cluster] rows
    def generate_response(indices, metrics, clusters: true)
      words = vocab.values_at(*indices)
      if self.clusters && clusters
        words.zip(metrics, self.clusters.clusters.values_at(*indices))
      else
        words.zip(metrics)
      end
    end

    # Not implemented yet.
    # @raise [NotImplementedError]
    def to_mmap(fname)
      raise NotImplementedError
    end

    # Loads vectors from a word2vec binary file: a "<vocab_size> <vector_size>"
    # header line, then per entry the word, a space, vector_size 32-bit
    # floats and one terminating byte.
    #
    # @param fname [String] path of the binary file
    # @param vocab_unicode_size [Integer] unused, kept for compatibility
    # @param desired_vocab [#include?, nil] when given, only these words are kept
    # @param encoding [String] encoding applied to the words
    # @return [WordVectors] with every kept vector passed through unitvec
    # @raise [EOFError] when the file ends before all announced entries are read
    def self.from_binary(fname, vocab_unicode_size: 78, desired_vocab: nil, encoding: "utf-8")
      vocab = []
      vectors = []

      File.open(fname, 'rb') do |fin|
        vocab_size, vector_size = fin.readline.split.map(&:to_i)
        binary_len = 4 * vector_size # 4 bytes per 32-bit float component

        vocab_size.times do
          # Tag the encoding before the membership test: a binary string
          # never equals a non-ASCII UTF-8 entry of desired_vocab.
          word = read_binary_word(fin).force_encoding(encoding)

          packed = fin.read(binary_len)
          if packed.nil? || packed.bytesize < binary_len
            raise EOFError, "truncated vector data in #{fname}"
          end
          fin.read(1) # newline

          next unless desired_vocab.nil? || desired_vocab.include?(word)

          vocab << word
          vectors << unitvec(packed.unpack('f*'))
        end
      end

      new(vocab: vocab, vectors: vectors)
    end

    # Loads vectors from a word2vec text file: a header line followed by
    # "<word> <v1> <v2> ..." lines. Blank lines are skipped.
    #
    # @param fname [String] path of the text file
    # @param vocab_unicode_size [Integer] unused, kept for compatibility
    # @param desired_vocab [#include?, nil] when given, only these words are kept
    # @param encoding [String] encoding applied to each line
    # @return [WordVectors] with every kept vector passed through unitvec
    def self.from_text(fname, vocab_unicode_size: 78, desired_vocab: nil, encoding: "utf-8")
      vocab = []
      vectors = []

      File.open(fname, 'rb') do |fin|
        fin.readline # header; entries are collected as they are read

        fin.each_line do |line|
          word, *components = line.force_encoding(encoding).strip.split(" ")
          next if word.nil? # blank line
          next unless desired_vocab.nil? || desired_vocab.include?(word)

          vocab << word
          vectors << unitvec(components.map(&:to_f))
        end
      end

      new(vocab: vocab, vectors: vectors)
    end

    # Not implemented yet.
    # @raise [NotImplementedError]
    def self.from_mmap(fname)
      raise NotImplementedError
    end

    # Reads bytes up to (and consuming) the next space and returns them as a
    # binary string.
    def self.read_binary_word(fin)
      word = String.new
      loop do
        ch = fin.read(1)
        raise EOFError, 'unexpected end of file while reading a word' if ch.nil?
        break if ch == ' '
        word << ch
      end
      word
    end
    private_class_method :read_binary_word
  end
end
@@ -0,0 +1,29 @@
1
# coding: utf-8
# Gem specification for word2vec; the version is read from lib/word2vec/version.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'word2vec/version'

Gem::Specification.new do |gem|
  gem.name     = "word2vec"
  gem.version  = Word2Vec::VERSION
  gem.authors  = ["cafedomancer"]
  gem.email    = ["cafedomancer@gmail.com"]

  gem.summary     = "A simple wrapper for word2vec."
  gem.description = "A simple wrapper for word2vec."
  gem.homepage    = "https://github.com/cafedomancer/word2vec"
  gem.license     = "MIT"

  # Ship every git-tracked file except the test suites.
  tracked = `git ls-files -z`.split("\x0")
  gem.files         = tracked.reject { |path| path.match(%r{^(test|spec|features)/}) }
  gem.bindir        = "exe"
  gem.executables   = gem.files.grep(%r{^exe/}) { |path| File.basename(path) }
  gem.require_paths = ["lib"]
  gem.extensions    = ["ext/word2vec/extconf.rb"]

  gem.add_runtime_dependency "nmatrix", "~> 0.2.3"

  gem.add_development_dependency "bundler", "~> 1.12"
  gem.add_development_dependency "rake", "~> 10.0"
  gem.add_development_dependency "rake-compiler", "~> 1.0"
  gem.add_development_dependency "rspec", "~> 3.0"
end
metadata ADDED
@@ -0,0 +1,151 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: word2vec
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - cafedomancer
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-11-05 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nmatrix
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.2.3
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.2.3
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.12'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.12'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '10.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake-compiler
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.0'
83
+ description: A simple wrapper for word2vec.
84
+ email:
85
+ - cafedomancer@gmail.com
86
+ executables: []
87
+ extensions:
88
+ - ext/word2vec/extconf.rb
89
+ extra_rdoc_files: []
90
+ files:
91
+ - ".gitignore"
92
+ - ".rspec"
93
+ - ".travis.yml"
94
+ - CODE_OF_CONDUCT.md
95
+ - Gemfile
96
+ - LICENSE.txt
97
+ - README.md
98
+ - Rakefile
99
+ - bin/console
100
+ - bin/setup
101
+ - ext/word2vec/LICENSE
102
+ - ext/word2vec/README.txt
103
+ - ext/word2vec/compute-accuracy.c
104
+ - ext/word2vec/demo-analogy.sh
105
+ - ext/word2vec/demo-classes.sh
106
+ - ext/word2vec/demo-phrase-accuracy.sh
107
+ - ext/word2vec/demo-phrases.sh
108
+ - ext/word2vec/demo-train-big-model-v1.sh
109
+ - ext/word2vec/demo-word-accuracy.sh
110
+ - ext/word2vec/demo-word.sh
111
+ - ext/word2vec/distance.c
112
+ - ext/word2vec/extconf.rb
113
+ - ext/word2vec/makefile
114
+ - ext/word2vec/questions-phrases.txt
115
+ - ext/word2vec/questions-words.txt
116
+ - ext/word2vec/word-analogy.c
117
+ - ext/word2vec/word2phrase.c
118
+ - ext/word2vec/word2vec.c
119
+ - lib/word2vec.rb
120
+ - lib/word2vec/io.rb
121
+ - lib/word2vec/scripts_interface.rb
122
+ - lib/word2vec/utils.rb
123
+ - lib/word2vec/version.rb
124
+ - lib/word2vec/word_clusters.rb
125
+ - lib/word2vec/word_vectors.rb
126
+ - word2vec.gemspec
127
+ homepage: https://github.com/cafedomancer/word2vec
128
+ licenses:
129
+ - MIT
130
+ metadata: {}
131
+ post_install_message:
132
+ rdoc_options: []
133
+ require_paths:
134
+ - lib
135
+ required_ruby_version: !ruby/object:Gem::Requirement
136
+ requirements:
137
+ - - ">="
138
+ - !ruby/object:Gem::Version
139
+ version: '0'
140
+ required_rubygems_version: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - ">="
143
+ - !ruby/object:Gem::Version
144
+ version: '0'
145
+ requirements: []
146
+ rubyforge_project:
147
+ rubygems_version: 2.6.8
148
+ signing_key:
149
+ specification_version: 4
150
+ summary: A simple wrapper for word2vec.
151
+ test_files: []