word2vec 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ require "word2vec/version"
2
+
3
+ require "word2vec/io"
4
+ require "word2vec/word_vectors"
5
+ require "word2vec/word_clusters"
6
+ require "word2vec/scripts_interface"
@@ -0,0 +1,27 @@
1
module Word2Vec
  # Loads word vectors from +fname+, choosing the reader from +kind+.
  #
  # @param fname [String, Pathname] path of the vectors file; it is handed to
  #   the selected reader unchanged
  # @param args [Array] extra positional arguments forwarded to the reader
  # @param kind [String, Symbol] 'auto', 'bin', 'txt' or 'mmap'. With 'auto'
  #   the kind is derived from the file name suffix ('.bin' or '.txt').
  # @param kwargs [Hash] keyword arguments forwarded to the reader
  # @return [Object] whatever the selected Word2Vec::WordVectors reader returns
  # @raise [RuntimeError] 'Could not identify kind' when +kind+ is 'auto' and
  #   the suffix is neither '.bin' nor '.txt'; 'Unknown kind' for any other
  #   unsupported +kind+
  def self.load(fname, *args, kind: 'auto', **kwargs)
    # Normalise so that :bin / Pathname inputs behave like their String forms.
    kind = kind.to_s
    path = fname.to_s

    if kind == 'auto'
      kind =
        if path.end_with?('.bin')
          'bin'
        elsif path.end_with?('.txt')
          'txt'
        else
          raise 'Could not identify kind'
        end
    end

    case kind
    when 'bin'  then Word2Vec::WordVectors.from_binary(fname, *args, **kwargs)
    when 'txt'  then Word2Vec::WordVectors.from_text(fname, *args, **kwargs)
    when 'mmap' then Word2Vec::WordVectors.from_mmap(fname, *args, **kwargs)
    else raise 'Unknown kind'
    end
  end

  # Loads a word-to-cluster assignment file.
  #
  # @param fname [String] path handed to Word2Vec::WordClusters.from_text
  # @return [Object] the value returned by Word2Vec::WordClusters.from_text
  def self.load_clusters(fname)
    Word2Vec::WordClusters.from_text(fname)
  end
end
@@ -0,0 +1,97 @@
1
module Word2Vec
  # Runs the bundled +word2vec+ executable to train word vectors.
  #
  # Every option is converted with +to_s+ and passed as the value of the
  # command-line flag of the same name (+iter_+ maps to +-iter+, +min_count+
  # to +-min-count+). +-save-vocab+ / +-read-vocab+ are only added when the
  # corresponding argument is not nil.
  #
  # @param train [#to_s] value for +-train+
  # @param output [#to_s] value for +-output+
  # @param verbose [Boolean] forwarded to run_cmd (currently unused there)
  # @return [true, false, nil] the result of run_cmd
  def self.word2vec(train, output, size: 100, window: 5, sample: '1e-3', hs: 0,
                    negative: 5, threads: 12, iter_: 5, min_count: 5, alpha: 0.025,
                    debug: 2, binary: 1, cbow: 1, save_vocab: nil, read_vocab: nil,
                    verbose: false)
    options = [
      ['-train', train], ['-output', output], ['-size', size], ['-window', window],
      ['-sample', sample], ['-hs', hs], ['-negative', negative], ['-threads', threads],
      ['-iter', iter_], ['-min-count', min_count], ['-alpha', alpha], ['-debug', debug],
      ['-binary', binary], ['-cbow', cbow]
    ]
    command = build_command('word2vec', options, save_vocab: save_vocab, read_vocab: read_vocab)
    run_cmd(command, verbose: verbose)
  end

  # Same as word2vec, but additionally passes +-classes+ with +classes+ as
  # its value (placed after +-cbow+).
  #
  # @param train [#to_s] value for +-train+
  # @param output [#to_s] value for +-output+
  # @param classes [#to_s] value for +-classes+
  # @return [true, false, nil] the result of run_cmd
  def self.word2clusters(train, output, classes, size: 100, window: 5, sample: '1e-3',
                         hs: 0, negative: 5, threads: 12, iter_: 5, min_count: 5,
                         alpha: 0.025, debug: 2, binary: 1, cbow: 1,
                         save_vocab: nil, read_vocab: nil, verbose: false)
    options = [
      ['-train', train], ['-output', output], ['-size', size], ['-window', window],
      ['-sample', sample], ['-hs', hs], ['-negative', negative], ['-threads', threads],
      ['-iter', iter_], ['-min-count', min_count], ['-alpha', alpha], ['-debug', debug],
      ['-binary', binary], ['-cbow', cbow], ['-classes', classes]
    ]
    command = build_command('word2vec', options, save_vocab: save_vocab, read_vocab: read_vocab)
    run_cmd(command, verbose: verbose)
  end

  # Runs the bundled +word2phrase+ executable.
  #
  # @param train [#to_s] value for +-train+
  # @param output [#to_s] value for +-output+
  # @return [true, false, nil] the result of run_cmd
  def self.word2phrase(train, output, min_count: 5, threshold: 100, debug: 2,
                       verbose: false)
    options = [
      ['-train', train], ['-output', output], ['-min-count', min_count],
      ['-threshold', threshold], ['-debug', debug]
    ]
    run_cmd(build_command('word2phrase', options), verbose: verbose)
  end

  # Placeholder; not implemented.
  #
  # @raise [NotImplementedError] always
  def self.doc2vec(train, output, size: 100, window: 5, sample: '1e-3', hs: 0, negative: 5,
                   threads: 12, iter_: 5, min_count: 5, alpha: 0.025, debug: 2, binary: 1,
                   cbow: 1,
                   save_vocab: nil, read_vocab: nil, verbose: nil)
    raise NotImplementedError
  end

  # Echoes +command+ (space-joined, via +p+) and executes it.
  #
  # The argument vector is handed to Kernel#system element by element instead
  # of as one joined string, so no shell is involved: paths containing spaces
  # stay intact and shell metacharacters in arguments are not interpreted.
  #
  # @param command [Array<#to_s>] program followed by its arguments
  # @param verbose [Boolean] accepted for interface compatibility; not used yet
  # @return [true, false, nil] the value returned by Kernel#system
  def self.run_cmd(command, verbose: false)
    argv = command.map(&:to_s)
    p argv.join(' ')
    return nil if argv.empty?

    program, *arguments = argv
    # The [name, argv0] form guarantees a direct exec even with no arguments.
    system([program, program], *arguments)

    # TODO: implement it later
    # if verbose
    #   while line = stdout.readline
    #     $stdout.write(line)
    #     if line.include?('ERROR:')
    #       raise Exception(line)
    #     end
    #     $stdout.flush
    #   end
    # end
  end

  # Builds the argument vector for one of the executables under ext/word2vec.
  #
  # @param tool [String] executable file name inside ext/word2vec
  # @param options [Array<Array(String, Object)>] ordered flag/value pairs
  # @param save_vocab [Object, nil] adds +-save-vocab+ unless nil
  # @param read_vocab [Object, nil] adds +-read-vocab+ unless nil
  # @return [Array<String>] program path followed by flags and values
  def self.build_command(tool, options, save_vocab: nil, read_vocab: nil)
    ext = File.expand_path('../../../ext/word2vec', __FILE__)
    command = [File.join(ext, tool)]
    options.each { |flag, value| command << flag << value.to_s }
    # nil (not falsiness) decides, so an explicit `false` is still passed on.
    command << '-save-vocab' << save_vocab.to_s unless save_vocab.nil?
    command << '-read-vocab' << read_vocab.to_s unless read_vocab.nil?
    command
  end
  private_class_method :build_command
end
@@ -0,0 +1,9 @@
1
+ require "nmatrix"
2
+
3
module Word2Vec
  class WordVectors
    # Rescales +vec+ by the reciprocal of its NMatrix#norm2 value.
    #
    # The NMatrix is built once and reused for both the norm and the scaling
    # (previously it was constructed twice per call).
    #
    # @param vec [Array<Numeric>] components, splatted into NMatrix[]
    # @return [Array] the scaled components, as returned by NMatrix#to_a
    # NOTE(review): a zero-norm input is not special-cased; the factor is then
    # 1.0 / norm2 of zero — confirm callers never pass an all-zero vector.
    def self.unitvec(vec)
      matrix = NMatrix[*vec]
      (matrix * (1.0 / matrix.norm2)).to_a
    end
  end
end
@@ -0,0 +1,3 @@
1
module Word2Vec
  # Release number of the gem; read by the gemspec. Frozen so the constant
  # cannot be mutated in place by accident.
  VERSION = "0.1.1".freeze
end
@@ -0,0 +1,36 @@
1
+ require "csv"
2
+
3
module Word2Vec
  # Pairs a vocabulary with the cluster id assigned to each word.
  # +vocab[i]+ and +clusters[i]+ describe the same entry.
  class WordClusters
    attr_accessor :vocab, :clusters

    # @param vocab [Array<String>] words, one per entry
    # @param clusters [Array<Integer>] cluster id of the word at the same index
    def initialize(vocab:, clusters:)
      self.vocab = vocab
      self.clusters = clusters
    end

    # Not implemented yet.
    # @raise [NotImplementedError] always
    def ix(word)
      raise NotImplementedError
    end

    # Not implemented yet.
    # @raise [NotImplementedError] always
    def [](word)
      raise NotImplementedError
    end

    # Not implemented yet.
    # @raise [NotImplementedError] always
    def get_cluster(word)
      raise NotImplementedError
    end

    # Returns every word whose cluster id equals +cluster+, in vocabulary order.
    #
    # @param cluster [Integer] cluster id to look up
    # @return [Array<String>] matching words (empty when none match)
    def get_words_on_cluster(cluster)
      # Index-wise lookup instead of values_at(*indices): avoids splatting a
      # potentially very large argument list.
      clusters.each_index.select { |i| clusters[i] == cluster }.map { |i| vocab[i] }
    end

    # Reads a space-separated file whose first column is the word and whose
    # second column is the cluster id (converted with +to_i+).
    #
    # Blank lines are skipped and an empty file yields an empty instance;
    # both previously raised.
    #
    # @param fname [String] path of the clusters file
    # @return [WordClusters]
    # NOTE(review): parsing goes through CSV, so CSV quoting rules apply to
    # each token — confirm whether plain whitespace splitting is wanted.
    def self.from_text(fname)
      rows = CSV.read(fname, col_sep: " ").reject(&:empty?)
      vocab = rows.map { |row| row[0] }
      clusters = rows.map { |row| row[1].to_i }
      new(vocab: vocab, clusters: clusters)
    end
  end
end
@@ -0,0 +1,182 @@
1
+ require "nmatrix"
2
+
3
+ require "word2vec/utils"
4
+
5
module Word2Vec
  # Vocabulary plus one vector per word, with lookup and similarity helpers.
  # +vocab[i]+ and +vectors[i]+ describe the same entry; +vocab_hash+ maps a
  # word back to its index.
  class WordVectors
    attr_accessor :vocab, :vectors, :clusters, :vocab_hash

    # @param vocab [Array<String>] words
    # @param vectors [Array<Array<Float>>] vector of the word at the same index
    # @param clusters [Object, nil] optional object responding to +clusters+
    #   (used by generate_response)
    def initialize(vocab:, vectors:, clusters: nil)
      @vocab = vocab
      @vectors = vectors
      @clusters = clusters

      @vocab_hash = {}
      vocab.each_with_index { |word, i| @vocab_hash[word] = i }
    end

    # @param word [String]
    # @return [Integer, nil] index of +word+, or nil when it is unknown
    def ix(word)
      vocab_hash[word]
    end

    # @param ix [Integer] index into the vocabulary
    # @return [String, nil] the word stored at +ix+
    def word(ix)
      vocab[ix]
    end

    # Shorthand for get_vector.
    def [](word)
      get_vector(word)
    end

    # @param word [String]
    # @return [Boolean] whether +word+ is part of the vocabulary
    def include?(word)
      vocab_hash.key?(word)
    end

    # @param word [String] a word present in the vocabulary
    # @return [Array<Float>] the vector stored for +word+
    # @raise [TypeError] when +word+ is unknown (its index is nil)
    def get_vector(word)
      vectors[ix(word)]
    end

    # Ranks all words by the dot product of their vector with the vector of
    # +word+ and returns entries 1..n of the descending ranking (the top entry
    # is skipped; presumably it is +word+ itself).
    #
    # @param word [String] query word
    # @param n [Integer] number of results
    # @return [Array(Array<Integer>, Array<Float>)] indices and their metrics
    def cosine(word, n: 10)
      query = NMatrix[self[word], dtype: :float32].transpose
      metrics = NMatrix[*vectors, dtype: :float32].dot(query)
      best = metrics.sorted_indices.reverse[1..n]
      best_metrics = metrics.to_a.values_at(*best).flatten
      [best, best_metrics]
    end

    # Ranks all words against the mean of the +pos+ vectors and the negated
    # +neg+ vectors, leaving the query words themselves out of the result.
    #
    # @param pos [Array<String>] words contributing positively
    # @param neg [Array<String>] words contributing negatively
    # @param n [Integer] maximum number of results
    # @return [Array(Array<Integer>, Array<Float>)] indices and their metrics
    def analogy(pos:, neg:, n: 10)
      exclude = pos + neg
      signed = pos.map { |word| [word, 1.0] } + neg.map { |word| [word, -1.0] }

      rows = signed.map do |word, direction|
        (NMatrix[*self[word], dtype: :float32] * direction).to_a
      end
      mean = NMatrix[*rows, dtype: :float32].mean

      metrics = NMatrix[*vectors, dtype: :float32].dot(mean.transpose)
      candidates = metrics.sorted_indices.reverse[0...(n + exclude.size)]

      # Filter by value. Deleting by position one entry at a time shifts the
      # remaining positions and removes the wrong candidates.
      excluded = exclude.map { |word| ix(word) }
      best = candidates.reject { |index| excluded.include?(index) }.first(n)
      best_metrics = metrics.to_a.flatten.values_at(*best)
      [best, best_metrics]
    end

    # Turns index/metric lists (as returned by cosine or analogy) into rows.
    #
    # @param indices [Array<Integer>] vocabulary indices
    # @param metrics [Array<Float>] metric for each index
    # @param clusters [Boolean] add the cluster id when cluster data is set
    # @return [Array<Array>] [word, metric] or [word, metric, cluster] rows
    def generate_response(indices, metrics, clusters: true)
      words = vocab.values_at(*indices)
      if self.clusters && clusters
        words.zip(metrics, self.clusters.clusters.values_at(*indices))
      else
        words.zip(metrics)
      end
    end

    # Not implemented yet.
    # @raise [NotImplementedError] always
    def to_mmap(fname)
      raise NotImplementedError
    end

    # Reads vectors from a binary file: a text header line
    # "<vocab_size> <vector_size>", then per entry the word, one space,
    # +vector_size+ 4-byte floats (native byte order) and one separator byte.
    # Each kept vector is normalised with unitvec.
    #
    # @param fname [String] path of the file
    # @param vocab_unicode_size [Integer] unused; kept for compatibility
    # @param desired_vocab [#include?, nil] when given, only these words are kept
    # @param encoding [String] encoding applied to each word
    # @return [WordVectors]
    # @raise [EOFError] when the file ends before all entries were read
    def self.from_binary(fname, vocab_unicode_size: 78, desired_vocab: nil, encoding: "utf-8")
      vocab = []
      vectors = []

      File.open(fname, 'rb') do |fin|
        vocab_size, vector_size = fin.readline.split.map(&:to_i)
        binary_len = 4 * vector_size # 4 bytes per single-precision float

        vocab_size.times do
          # Apply the encoding before the membership test so non-ASCII words
          # can match the entries of desired_vocab.
          word = read_binary_word(fin).force_encoding(encoding)
          packed = fin.read(binary_len)
          if packed.nil? || packed.bytesize < binary_len
            raise EOFError, "unexpected end of file while reading the vector of #{word.inspect}"
          end
          fin.read(1) # newline

          next unless desired_vocab.nil? || desired_vocab.include?(word)

          vocab << word
          vectors << unitvec(packed.unpack('f*'))
        end
      end

      new(vocab: vocab, vectors: vectors)
    end

    # Reads vectors from a text file: a header line followed by one
    # "<word> <v1> <v2> ..." line per entry. Blank lines are skipped and each
    # kept vector is normalised with unitvec.
    #
    # @param fname [String] path of the file
    # @param vocab_unicode_size [Integer] unused; kept for compatibility
    # @param desired_vocab [#include?, nil] when given, only these words are kept
    # @param encoding [String] encoding applied to each line
    # @return [WordVectors]
    def self.from_text(fname, vocab_unicode_size: 78, desired_vocab: nil, encoding: "utf-8")
      vocab = []
      vectors = []

      File.open(fname, 'rb') do |fin|
        fin.readline # header: sizes are not needed, entries are appended

        fin.each_line do |line|
          parts = line.force_encoding(encoding).strip.split(" ")
          next if parts.empty?

          word = parts[0]
          next unless desired_vocab.nil? || desired_vocab.include?(word)

          vocab << word
          vectors << unitvec(parts[1..-1].map(&:to_f))
        end
      end

      new(vocab: vocab, vectors: vectors)
    end

    # Not implemented yet.
    # @raise [NotImplementedError] always
    def self.from_mmap(fname)
      raise NotImplementedError
    end

    # Reads bytes from +io+ up to (and consuming) the next space.
    #
    # @param io [IO] stream positioned at the start of a word
    # @return [String] the raw bytes of the word
    # @raise [EOFError] when the stream ends before a space is found
    def self.read_binary_word(io)
      word = String.new
      loop do
        ch = io.read(1)
        raise EOFError, 'unexpected end of file while reading a word' if ch.nil?
        break if ch == ' '

        word << ch
      end
      word
    end
    private_class_method :read_binary_word
  end
end
@@ -0,0 +1,29 @@
1
+ # coding: utf-8
2
# Put lib/ on the load path so the version constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'word2vec/version'

Gem::Specification.new do |gem|
  # Identity
  gem.name    = "word2vec"
  gem.version = Word2Vec::VERSION
  gem.authors = ["cafedomancer"]
  gem.email   = ["cafedomancer@gmail.com"]

  # Description shown by the registry
  gem.summary     = "A simple wrapper for word2vec."
  gem.description = "A simple wrapper for word2vec."
  gem.homepage    = "https://github.com/cafedomancer/word2vec"
  gem.license     = "MIT"

  # Ship every git-tracked file except those under test/, spec/ or features/.
  tracked_files = `git ls-files -z`.split("\x0")
  gem.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }

  # Executables are the tracked files under exe/, listed by base name.
  gem.bindir        = "exe"
  gem.executables   = gem.files.grep(%r{^exe/}) { |path| File.basename(path) }
  gem.require_paths = ["lib"]
  gem.extensions    = ["ext/word2vec/extconf.rb"]

  gem.add_runtime_dependency "nmatrix", "~> 0.2.3"

  gem.add_development_dependency "bundler", "~> 1.12"
  gem.add_development_dependency "rake", "~> 10.0"
  gem.add_development_dependency "rake-compiler", "~> 1.0"
  gem.add_development_dependency "rspec", "~> 3.0"
end
metadata ADDED
@@ -0,0 +1,151 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: word2vec
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - cafedomancer
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-11-05 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nmatrix
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.2.3
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.2.3
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.12'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.12'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '10.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake-compiler
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.0'
83
+ description: A simple wrapper for word2vec.
84
+ email:
85
+ - cafedomancer@gmail.com
86
+ executables: []
87
+ extensions:
88
+ - ext/word2vec/extconf.rb
89
+ extra_rdoc_files: []
90
+ files:
91
+ - ".gitignore"
92
+ - ".rspec"
93
+ - ".travis.yml"
94
+ - CODE_OF_CONDUCT.md
95
+ - Gemfile
96
+ - LICENSE.txt
97
+ - README.md
98
+ - Rakefile
99
+ - bin/console
100
+ - bin/setup
101
+ - ext/word2vec/LICENSE
102
+ - ext/word2vec/README.txt
103
+ - ext/word2vec/compute-accuracy.c
104
+ - ext/word2vec/demo-analogy.sh
105
+ - ext/word2vec/demo-classes.sh
106
+ - ext/word2vec/demo-phrase-accuracy.sh
107
+ - ext/word2vec/demo-phrases.sh
108
+ - ext/word2vec/demo-train-big-model-v1.sh
109
+ - ext/word2vec/demo-word-accuracy.sh
110
+ - ext/word2vec/demo-word.sh
111
+ - ext/word2vec/distance.c
112
+ - ext/word2vec/extconf.rb
113
+ - ext/word2vec/makefile
114
+ - ext/word2vec/questions-phrases.txt
115
+ - ext/word2vec/questions-words.txt
116
+ - ext/word2vec/word-analogy.c
117
+ - ext/word2vec/word2phrase.c
118
+ - ext/word2vec/word2vec.c
119
+ - lib/word2vec.rb
120
+ - lib/word2vec/io.rb
121
+ - lib/word2vec/scripts_interface.rb
122
+ - lib/word2vec/utils.rb
123
+ - lib/word2vec/version.rb
124
+ - lib/word2vec/word_clusters.rb
125
+ - lib/word2vec/word_vectors.rb
126
+ - word2vec.gemspec
127
+ homepage: https://github.com/cafedomancer/word2vec
128
+ licenses:
129
+ - MIT
130
+ metadata: {}
131
+ post_install_message:
132
+ rdoc_options: []
133
+ require_paths:
134
+ - lib
135
+ required_ruby_version: !ruby/object:Gem::Requirement
136
+ requirements:
137
+ - - ">="
138
+ - !ruby/object:Gem::Version
139
+ version: '0'
140
+ required_rubygems_version: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - ">="
143
+ - !ruby/object:Gem::Version
144
+ version: '0'
145
+ requirements: []
146
+ rubyforge_project:
147
+ rubygems_version: 2.6.8
148
+ signing_key:
149
+ specification_version: 4
150
+ summary: A simple wrapper for word2vec.
151
+ test_files: []