rsemantic 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/{README.txt → README.md} +19 -10
- data/lib/semantic.rb +8 -5
- data/lib/semantic/compare.rb +4 -1
- data/lib/semantic/corpus.rb +61 -0
- data/lib/semantic/document.rb +39 -0
- data/lib/semantic/matrix_transformer.rb +4 -5
- data/lib/semantic/parser.rb +22 -10
- data/lib/semantic/search.rb +22 -16
- data/lib/semantic/search_result.rb +16 -0
- data/lib/semantic/transform/lsa_transform.rb +47 -22
- data/lib/semantic/transform/tf_idf_transform.rb +12 -23
- data/lib/semantic/vector_space/builder.rb +29 -22
- data/lib/semantic/vector_space/model.rb +14 -13
- data/lib/semantic/version.rb +1 -1
- data/lib/tasks/rspec.rake +13 -0
- metadata +75 -107
- data/Manifest.txt +0 -38
- data/Rakefile +0 -9
- data/config/hoe.rb +0 -69
- data/config/requirements.rb +0 -15
- data/gem_tasks/deployment.rake +0 -34
- data/gem_tasks/environment.rake +0 -7
- data/gem_tasks/examples.rake +0 -29
- data/gem_tasks/fix_cr_lf.rake +0 -10
- data/gem_tasks/gemspec.rake +0 -6
- data/gem_tasks/rspec.rake +0 -33
- data/gem_tasks/website.rake +0 -17
- data/rsemantic.gemspec +0 -41
- data/spec/semantic/compare_spec.rb +0 -16
- data/spec/semantic/matrix_transformer_spec.rb +0 -51
- data/spec/semantic/parser_spec.rb +0 -34
- data/spec/semantic/search_spec.rb +0 -129
- data/spec/semantic/transform/lsa_transform_spec.rb +0 -59
- data/spec/semantic/transform/tf_idf_transform_spec.rb +0 -35
- data/spec/semantic/vector_space/builder_spec.rb +0 -44
- data/spec/semantic/vector_space/model_spec.rb +0 -22
- data/spec/spec.opts +0 -2
- data/spec/spec_helper.rb +0 -7
@@ -3,38 +3,27 @@ module Semantic
|
|
3
3
|
class TFIDF
|
4
4
|
|
5
5
|
@@number_of_documents_with_term = []
|
6
|
-
|
7
|
-
|
8
|
-
number_of_documents = matrix.num_columns
|
6
|
+
def self.transform!(matrix)
|
7
|
+
number_of_documents = matrix.size2
|
9
8
|
@@number_of_documents_with_term = []
|
10
9
|
|
11
|
-
matrix.
|
12
|
-
document_term_total = document.
|
10
|
+
matrix.transpose.enum_for(:each_row).with_index do |document, column_index|
|
11
|
+
document_term_total = document.sum
|
12
|
+
|
13
|
+
document.enum_for(:each).with_index do |term_weight, row_index|
|
14
|
+
unless term_weight == 0.0
|
15
|
+
inverse_document_frequency = GSL::Sf.log((number_of_documents /
|
16
|
+
number_of_documents_with_term(row_index, matrix).to_f).abs)
|
17
|
+
term_frequency = (term_weight / document_term_total)
|
13
18
|
|
14
|
-
|
15
|
-
unless term_weight.to_f == 0.0
|
16
|
-
matrix[row_index, column_index] = (term_weight / document_term_total) *
|
17
|
-
Math.log((number_of_documents / number_of_documents_with_term(row_index, matrix).to_f).abs)
|
19
|
+
matrix[row_index, column_index] = term_frequency * inverse_document_frequency
|
18
20
|
end
|
19
21
|
end
|
20
22
|
end
|
21
|
-
matrix
|
22
23
|
end
|
23
24
|
|
24
25
|
def self.number_of_documents_with_term(row_index, matrix)
|
25
|
-
|
26
|
-
|
27
|
-
term_document_occurences = 0
|
28
|
-
|
29
|
-
rows,cols = matrix.dimensions
|
30
|
-
|
31
|
-
for n in (0...cols)
|
32
|
-
if matrix[row_index, n] > 0 #Term appears in document
|
33
|
-
term_document_occurences += 1
|
34
|
-
end
|
35
|
-
end
|
36
|
-
@@number_of_documents_with_term[row_index] = term_document_occurences
|
37
|
-
@@number_of_documents_with_term[row_index]
|
26
|
+
@@number_of_documents_with_term[row_index] ||= matrix.row(row_index).where.size
|
38
27
|
end
|
39
28
|
|
40
29
|
end
|
@@ -1,22 +1,26 @@
|
|
1
1
|
module Semantic
|
2
2
|
module VectorSpace
|
3
|
-
#A algebraic model for representing text documents as vectors of identifiers.
|
4
|
-
#A document is represented as a vector. Each dimension of the vector corresponds to a
|
5
|
-
#separate term. If a term occurs in the document, then the value in the vector is non-zero.
|
3
|
+
# A algebraic model for representing text documents as vectors of identifiers.
|
4
|
+
# A document is represented as a vector. Each dimension of the vector corresponds to a
|
5
|
+
# separate term. If a term occurs in the document, then the value in the vector is non-zero.
|
6
6
|
class Builder
|
7
7
|
|
8
|
-
def initialize(options={})
|
9
|
-
@parser = Parser.new
|
10
|
-
@options = options
|
8
|
+
def initialize(options = {})
|
9
|
+
@parser = Parser.new(:filter_stop_words => options[:filter_stop_words])
|
11
10
|
@parsed_document_cache = []
|
12
11
|
end
|
13
12
|
|
14
13
|
def build_document_matrix(documents)
|
15
14
|
@vector_keyword_index = build_vector_keyword_index(documents)
|
16
|
-
|
15
|
+
|
17
16
|
document_vectors = documents.enum_for(:each_with_index).map{|document,document_id| build_vector(document, document_id)}
|
18
|
-
|
19
|
-
|
17
|
+
|
18
|
+
n = document_vectors.size
|
19
|
+
m = document_vectors.first.size
|
20
|
+
|
21
|
+
# TODO check where else we use document_vectors and if we can directly use column based ones
|
22
|
+
document_matrix = GSL::Matrix.alloc(*document_vectors.map {|v| v.transpose})
|
23
|
+
|
20
24
|
Model.new(document_matrix, @vector_keyword_index)
|
21
25
|
end
|
22
26
|
|
@@ -28,7 +32,7 @@ module Semantic
|
|
28
32
|
def build_vector_keyword_index(documents)
|
29
33
|
parse_and_cache(documents)
|
30
34
|
vocabulary_list = find_unique_vocabulary
|
31
|
-
|
35
|
+
map_vocabulary_to_vector_positions(vocabulary_list)
|
32
36
|
end
|
33
37
|
|
34
38
|
def parse_and_cache(documents)
|
@@ -38,32 +42,35 @@ module Semantic
|
|
38
42
|
end
|
39
43
|
|
40
44
|
def find_unique_vocabulary
|
41
|
-
|
42
|
-
vocabulary_list.uniq
|
45
|
+
@parsed_document_cache.flatten.reverse.uniq
|
43
46
|
end
|
44
|
-
|
47
|
+
|
45
48
|
def map_vocabulary_to_vector_positions(vocabulary_list)
|
46
49
|
vector_index={}
|
47
50
|
column = 0
|
48
|
-
|
51
|
+
vocabulary_list.each do |word|
|
49
52
|
vector_index[word] = column
|
50
53
|
column += 1
|
51
54
|
end
|
52
55
|
vector_index
|
53
|
-
end
|
54
|
-
|
55
|
-
def build_vector(word_string, document_id=nil)
|
56
|
+
end
|
57
|
+
|
58
|
+
def build_vector(word_string, document_id = nil)
|
56
59
|
if document_id.nil?
|
57
60
|
word_list = @parser.tokenise_and_filter(word_string)
|
58
61
|
else
|
59
62
|
word_list = @parsed_document_cache[document_id]
|
60
63
|
end
|
61
|
-
|
62
|
-
vector =
|
63
|
-
word_list.each { |word|
|
64
|
+
|
65
|
+
vector = GSL::Vector.alloc(@vector_keyword_index.length)
|
66
|
+
word_list.each { |word|
|
67
|
+
if @vector_keyword_index.has_key?(word)
|
68
|
+
vector[@vector_keyword_index[word]] += 1
|
69
|
+
end
|
70
|
+
}
|
71
|
+
|
64
72
|
vector
|
65
73
|
end
|
66
|
-
|
67
74
|
end
|
68
75
|
end
|
69
|
-
end
|
76
|
+
end
|
@@ -1,39 +1,40 @@
|
|
1
|
-
require '
|
1
|
+
require 'gsl'
|
2
2
|
require 'delegate'
|
3
3
|
require 'stringio'
|
4
4
|
|
5
5
|
module Semantic
|
6
6
|
module VectorSpace
|
7
7
|
|
8
|
-
class Model < DelegateClass(::
|
9
|
-
|
8
|
+
class Model < DelegateClass(::GSL::Matrix)
|
10
9
|
def initialize(matrix, keywords)
|
11
10
|
@keywords = keywords || {}
|
12
11
|
@_dc_obj = matrix
|
13
12
|
super(matrix)
|
14
13
|
end
|
15
|
-
|
14
|
+
|
16
15
|
def matrix=(matrix)
|
17
16
|
@_dc_obj = matrix
|
18
17
|
end
|
19
|
-
|
18
|
+
|
20
19
|
def matrix
|
21
20
|
@_dc_obj
|
22
21
|
end
|
23
|
-
|
22
|
+
|
24
23
|
def to_s
|
25
24
|
out = StringIO.new
|
26
25
|
out.print " " * 9
|
27
|
-
|
28
|
-
matrix.
|
29
|
-
out.print " D#{id+1} "
|
26
|
+
|
27
|
+
matrix.size2.times do |id|
|
28
|
+
out.print " D#{id+1} "
|
30
29
|
end
|
31
30
|
out.puts
|
32
31
|
|
33
|
-
matrix.
|
34
|
-
|
32
|
+
matrix.to_a.each_with_index do |terms, index|
|
33
|
+
# TODO fix for 1.8.7
|
34
|
+
out.print "#{@keywords.key(index).ljust(6)}" if @keywords.has_value?(index)
|
35
35
|
out.print "[ "
|
36
|
-
|
36
|
+
|
37
|
+
terms.each do |document|
|
37
38
|
out.print "%+0.2f " % document
|
38
39
|
end
|
39
40
|
out.print "]"
|
@@ -41,7 +42,7 @@ module Semantic
|
|
41
42
|
end
|
42
43
|
out.string
|
43
44
|
end
|
44
|
-
|
45
|
+
|
45
46
|
end
|
46
47
|
end
|
47
48
|
end
|
data/lib/semantic/version.rb
CHANGED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'rspec/core/rake_task'
|
2
|
+
|
3
|
+
desc 'Default: run specs.'
|
4
|
+
task :default => :spec
|
5
|
+
|
6
|
+
desc "Run specs"
|
7
|
+
RSpec::Core::RakeTask.new
|
8
|
+
|
9
|
+
desc "Generate code coverage"
|
10
|
+
RSpec::Core::RakeTask.new(:coverage) do |t|
|
11
|
+
t.rcov = true
|
12
|
+
t.rcov_opts = ['--exclude', 'spec']
|
13
|
+
end
|
metadata
CHANGED
@@ -1,136 +1,104 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: rsemantic
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.4
|
5
|
+
prerelease:
|
5
6
|
platform: ruby
|
6
|
-
authors:
|
7
|
+
authors:
|
7
8
|
- Joseph Wilk
|
8
9
|
autorequire:
|
9
10
|
bindir: bin
|
10
11
|
cert_chain: []
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
12
|
+
date: 2012-09-18 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: gsl
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - '='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 1.14.7
|
17
22
|
type: :runtime
|
18
|
-
|
19
|
-
version_requirements: !ruby/object:Gem::Requirement
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
- !ruby/object:Gem::Dependency
|
26
|
-
name:
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - '='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 1.14.7
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: stemmer
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: 1.0.1
|
27
38
|
type: :runtime
|
28
|
-
|
29
|
-
version_requirements: !ruby/object:Gem::Requirement
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
version_requirement:
|
39
|
-
version_requirements: !ruby/object:Gem::Requirement
|
40
|
-
requirements:
|
41
|
-
- - ">="
|
42
|
-
- !ruby/object:Gem::Version
|
43
|
-
version: 1.1.2
|
44
|
-
version:
|
45
|
-
- !ruby/object:Gem::Dependency
|
46
|
-
name: hoe
|
47
|
-
type: :development
|
48
|
-
version_requirement:
|
49
|
-
version_requirements: !ruby/object:Gem::Requirement
|
50
|
-
requirements:
|
51
|
-
- - ">="
|
52
|
-
- !ruby/object:Gem::Version
|
53
|
-
version: 2.3.2
|
54
|
-
version:
|
55
|
-
description: A document vector search with flexible matrix transforms. Currently supports Latent semantic analysis and Term frequency - inverse document frequency
|
56
|
-
email:
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 1.0.1
|
46
|
+
description: A document vector search with flexible matrix transforms. Currently supports
|
47
|
+
Latent semantic analysis and Term frequency - inverse document frequency
|
48
|
+
email:
|
57
49
|
- joe@josephwilk.net
|
58
50
|
executables: []
|
59
|
-
|
60
51
|
extensions: []
|
61
|
-
|
62
|
-
extra_rdoc_files:
|
63
|
-
- History.txt
|
64
|
-
- Manifest.txt
|
65
|
-
- README.txt
|
66
|
-
- TODO.txt
|
67
|
-
files:
|
52
|
+
extra_rdoc_files:
|
68
53
|
- History.txt
|
69
|
-
-
|
70
|
-
- README.txt
|
71
|
-
- Rakefile
|
54
|
+
- README.md
|
72
55
|
- TODO.txt
|
73
|
-
|
74
|
-
- config/requirements.rb
|
75
|
-
- gem_tasks/deployment.rake
|
76
|
-
- gem_tasks/environment.rake
|
77
|
-
- gem_tasks/examples.rake
|
78
|
-
- gem_tasks/fix_cr_lf.rake
|
79
|
-
- gem_tasks/gemspec.rake
|
80
|
-
- gem_tasks/rspec.rake
|
81
|
-
- gem_tasks/website.rake
|
82
|
-
- lib/semantic.rb
|
56
|
+
files:
|
83
57
|
- lib/semantic/compare.rb
|
58
|
+
- lib/semantic/corpus.rb
|
59
|
+
- lib/semantic/document.rb
|
84
60
|
- lib/semantic/matrix_transformer.rb
|
85
61
|
- lib/semantic/parser.rb
|
86
62
|
- lib/semantic/search.rb
|
87
|
-
- lib/semantic/
|
63
|
+
- lib/semantic/search_result.rb
|
88
64
|
- lib/semantic/transform/lsa_transform.rb
|
89
65
|
- lib/semantic/transform/tf_idf_transform.rb
|
90
|
-
- lib/semantic/
|
66
|
+
- lib/semantic/transform.rb
|
91
67
|
- lib/semantic/vector_space/builder.rb
|
92
68
|
- lib/semantic/vector_space/model.rb
|
69
|
+
- lib/semantic/vector_space.rb
|
93
70
|
- lib/semantic/version.rb
|
71
|
+
- lib/semantic.rb
|
72
|
+
- lib/tasks/rspec.rake
|
94
73
|
- resources/english.stop
|
95
|
-
-
|
96
|
-
-
|
97
|
-
-
|
98
|
-
- spec/semantic/parser_spec.rb
|
99
|
-
- spec/semantic/search_spec.rb
|
100
|
-
- spec/semantic/transform/lsa_transform_spec.rb
|
101
|
-
- spec/semantic/transform/tf_idf_transform_spec.rb
|
102
|
-
- spec/semantic/vector_space/builder_spec.rb
|
103
|
-
- spec/semantic/vector_space/model_spec.rb
|
104
|
-
- spec/spec.opts
|
105
|
-
- spec/spec_helper.rb
|
106
|
-
has_rdoc: true
|
74
|
+
- History.txt
|
75
|
+
- README.md
|
76
|
+
- TODO.txt
|
107
77
|
homepage: http://github.com/josephwilk/rsemantic
|
108
|
-
licenses:
|
109
|
-
|
78
|
+
licenses:
|
79
|
+
- MIT
|
110
80
|
post_install_message:
|
111
|
-
rdoc_options:
|
112
|
-
- --
|
113
|
-
|
114
|
-
require_paths:
|
81
|
+
rdoc_options:
|
82
|
+
- --charset=UTF-8
|
83
|
+
require_paths:
|
115
84
|
- lib
|
116
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
85
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
86
|
+
none: false
|
87
|
+
requirements:
|
88
|
+
- - ! '>='
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
version: '0'
|
91
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
92
|
+
none: false
|
93
|
+
requirements:
|
94
|
+
- - ! '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
128
97
|
requirements: []
|
129
|
-
|
130
|
-
|
131
|
-
rubygems_version: 1.3.5
|
98
|
+
rubyforge_project:
|
99
|
+
rubygems_version: 1.8.24
|
132
100
|
signing_key:
|
133
|
-
specification_version:
|
134
|
-
summary: A document vector search with flexible matrix transforms. Currently supports
|
101
|
+
specification_version: 3
|
102
|
+
summary: A document vector search with flexible matrix transforms. Currently supports
|
103
|
+
Latent semantic analysis and Term frequency - inverse document frequency
|
135
104
|
test_files: []
|
136
|
-
|
data/Manifest.txt
DELETED
@@ -1,38 +0,0 @@
|
|
1
|
-
History.txt
|
2
|
-
Manifest.txt
|
3
|
-
README.txt
|
4
|
-
Rakefile
|
5
|
-
TODO.txt
|
6
|
-
config/hoe.rb
|
7
|
-
config/requirements.rb
|
8
|
-
gem_tasks/deployment.rake
|
9
|
-
gem_tasks/environment.rake
|
10
|
-
gem_tasks/examples.rake
|
11
|
-
gem_tasks/fix_cr_lf.rake
|
12
|
-
gem_tasks/gemspec.rake
|
13
|
-
gem_tasks/rspec.rake
|
14
|
-
gem_tasks/website.rake
|
15
|
-
lib/semantic.rb
|
16
|
-
lib/semantic/compare.rb
|
17
|
-
lib/semantic/matrix_transformer.rb
|
18
|
-
lib/semantic/parser.rb
|
19
|
-
lib/semantic/search.rb
|
20
|
-
lib/semantic/transform.rb
|
21
|
-
lib/semantic/transform/lsa_transform.rb
|
22
|
-
lib/semantic/transform/tf_idf_transform.rb
|
23
|
-
lib/semantic/vector_space.rb
|
24
|
-
lib/semantic/vector_space/builder.rb
|
25
|
-
lib/semantic/vector_space/model.rb
|
26
|
-
lib/semantic/version.rb
|
27
|
-
resources/english.stop
|
28
|
-
rsemantic.gemspec
|
29
|
-
spec/semantic/compare_spec.rb
|
30
|
-
spec/semantic/matrix_transformer_spec.rb
|
31
|
-
spec/semantic/parser_spec.rb
|
32
|
-
spec/semantic/search_spec.rb
|
33
|
-
spec/semantic/transform/lsa_transform_spec.rb
|
34
|
-
spec/semantic/transform/tf_idf_transform_spec.rb
|
35
|
-
spec/semantic/vector_space/builder_spec.rb
|
36
|
-
spec/semantic/vector_space/model_spec.rb
|
37
|
-
spec/spec.opts
|
38
|
-
spec/spec_helper.rb
|