rsemantic 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. data/{README.txt → README.md} +19 -10
  2. data/lib/semantic.rb +8 -5
  3. data/lib/semantic/compare.rb +4 -1
  4. data/lib/semantic/corpus.rb +61 -0
  5. data/lib/semantic/document.rb +39 -0
  6. data/lib/semantic/matrix_transformer.rb +4 -5
  7. data/lib/semantic/parser.rb +22 -10
  8. data/lib/semantic/search.rb +22 -16
  9. data/lib/semantic/search_result.rb +16 -0
  10. data/lib/semantic/transform/lsa_transform.rb +47 -22
  11. data/lib/semantic/transform/tf_idf_transform.rb +12 -23
  12. data/lib/semantic/vector_space/builder.rb +29 -22
  13. data/lib/semantic/vector_space/model.rb +14 -13
  14. data/lib/semantic/version.rb +1 -1
  15. data/lib/tasks/rspec.rake +13 -0
  16. metadata +75 -107
  17. data/Manifest.txt +0 -38
  18. data/Rakefile +0 -9
  19. data/config/hoe.rb +0 -69
  20. data/config/requirements.rb +0 -15
  21. data/gem_tasks/deployment.rake +0 -34
  22. data/gem_tasks/environment.rake +0 -7
  23. data/gem_tasks/examples.rake +0 -29
  24. data/gem_tasks/fix_cr_lf.rake +0 -10
  25. data/gem_tasks/gemspec.rake +0 -6
  26. data/gem_tasks/rspec.rake +0 -33
  27. data/gem_tasks/website.rake +0 -17
  28. data/rsemantic.gemspec +0 -41
  29. data/spec/semantic/compare_spec.rb +0 -16
  30. data/spec/semantic/matrix_transformer_spec.rb +0 -51
  31. data/spec/semantic/parser_spec.rb +0 -34
  32. data/spec/semantic/search_spec.rb +0 -129
  33. data/spec/semantic/transform/lsa_transform_spec.rb +0 -59
  34. data/spec/semantic/transform/tf_idf_transform_spec.rb +0 -35
  35. data/spec/semantic/vector_space/builder_spec.rb +0 -44
  36. data/spec/semantic/vector_space/model_spec.rb +0 -22
  37. data/spec/spec.opts +0 -2
  38. data/spec/spec_helper.rb +0 -7
@@ -3,38 +3,27 @@ module Semantic
3
3
  class TFIDF
4
4
 
5
5
  @@number_of_documents_with_term = []
6
-
7
- def self.transform(matrix)
8
- number_of_documents = matrix.num_columns
6
+ def self.transform!(matrix)
7
+ number_of_documents = matrix.size2
9
8
  @@number_of_documents_with_term = []
10
9
 
11
- matrix.columns.each_with_index do |document, column_index|
12
- document_term_total = document.rows.inject(0.0) {|word_sum, word_count| word_sum + word_count.to_f }
10
+ matrix.transpose.enum_for(:each_row).with_index do |document, column_index|
11
+ document_term_total = document.sum
12
+
13
+ document.enum_for(:each).with_index do |term_weight, row_index|
14
+ unless term_weight == 0.0
15
+ inverse_document_frequency = GSL::Sf.log((number_of_documents /
16
+ number_of_documents_with_term(row_index, matrix).to_f).abs)
17
+ term_frequency = (term_weight / document_term_total)
13
18
 
14
- document.rows.each_with_index do |term_weight, row_index|
15
- unless term_weight.to_f == 0.0
16
- matrix[row_index, column_index] = (term_weight / document_term_total) *
17
- Math.log((number_of_documents / number_of_documents_with_term(row_index, matrix).to_f).abs)
19
+ matrix[row_index, column_index] = term_frequency * inverse_document_frequency
18
20
  end
19
21
  end
20
22
  end
21
- matrix
22
23
  end
23
24
 
24
25
  def self.number_of_documents_with_term(row_index, matrix)
25
- return @@number_of_documents_with_term[row_index] unless @@number_of_documents_with_term[row_index].nil?
26
-
27
- term_document_occurences = 0
28
-
29
- rows,cols = matrix.dimensions
30
-
31
- for n in (0...cols)
32
- if matrix[row_index, n] > 0 #Term appears in document
33
- term_document_occurences += 1
34
- end
35
- end
36
- @@number_of_documents_with_term[row_index] = term_document_occurences
37
- @@number_of_documents_with_term[row_index]
26
+ @@number_of_documents_with_term[row_index] ||= matrix.row(row_index).where.size
38
27
  end
39
28
 
40
29
  end
@@ -1,22 +1,26 @@
1
1
  module Semantic
2
2
  module VectorSpace
3
- #A algebraic model for representing text documents as vectors of identifiers.
4
- #A document is represented as a vector. Each dimension of the vector corresponds to a
5
- #separate term. If a term occurs in the document, then the value in the vector is non-zero.
3
+ # A algebraic model for representing text documents as vectors of identifiers.
4
+ # A document is represented as a vector. Each dimension of the vector corresponds to a
5
+ # separate term. If a term occurs in the document, then the value in the vector is non-zero.
6
6
  class Builder
7
7
 
8
- def initialize(options={})
9
- @parser = Parser.new
10
- @options = options
8
+ def initialize(options = {})
9
+ @parser = Parser.new(:filter_stop_words => options[:filter_stop_words])
11
10
  @parsed_document_cache = []
12
11
  end
13
12
 
14
13
  def build_document_matrix(documents)
15
14
  @vector_keyword_index = build_vector_keyword_index(documents)
16
-
15
+
17
16
  document_vectors = documents.enum_for(:each_with_index).map{|document,document_id| build_vector(document, document_id)}
18
- document_matrix = Linalg::DMatrix.join_columns(document_vectors)
19
-
17
+
18
+ n = document_vectors.size
19
+ m = document_vectors.first.size
20
+
21
+ # TODO check where else we use document_vectors and if we can directly use column based ones
22
+ document_matrix = GSL::Matrix.alloc(*document_vectors.map {|v| v.transpose})
23
+
20
24
  Model.new(document_matrix, @vector_keyword_index)
21
25
  end
22
26
 
@@ -28,7 +32,7 @@ module Semantic
28
32
  def build_vector_keyword_index(documents)
29
33
  parse_and_cache(documents)
30
34
  vocabulary_list = find_unique_vocabulary
31
- map_vocabulary_to_vector_positions(vocabulary_list)
35
+ map_vocabulary_to_vector_positions(vocabulary_list)
32
36
  end
33
37
 
34
38
  def parse_and_cache(documents)
@@ -38,32 +42,35 @@ module Semantic
38
42
  end
39
43
 
40
44
  def find_unique_vocabulary
41
- vocabulary_list = @parsed_document_cache.inject([]) { |parsed_document, vocabulary_list| vocabulary_list + parsed_document }
42
- vocabulary_list.uniq
45
+ @parsed_document_cache.flatten.reverse.uniq
43
46
  end
44
-
47
+
45
48
  def map_vocabulary_to_vector_positions(vocabulary_list)
46
49
  vector_index={}
47
50
  column = 0
48
- vocabulary_list.each do |word|
51
+ vocabulary_list.each do |word|
49
52
  vector_index[word] = column
50
53
  column += 1
51
54
  end
52
55
  vector_index
53
- end
54
-
55
- def build_vector(word_string, document_id=nil)
56
+ end
57
+
58
+ def build_vector(word_string, document_id = nil)
56
59
  if document_id.nil?
57
60
  word_list = @parser.tokenise_and_filter(word_string)
58
61
  else
59
62
  word_list = @parsed_document_cache[document_id]
60
63
  end
61
-
62
- vector = Linalg::DMatrix.new(@vector_keyword_index.length, 1)
63
- word_list.each { |word| vector[@vector_keyword_index[word] , 0] += 1 if @vector_keyword_index.has_key?(word) }
64
+
65
+ vector = GSL::Vector.alloc(@vector_keyword_index.length)
66
+ word_list.each { |word|
67
+ if @vector_keyword_index.has_key?(word)
68
+ vector[@vector_keyword_index[word]] += 1
69
+ end
70
+ }
71
+
64
72
  vector
65
73
  end
66
-
67
74
  end
68
75
  end
69
- end
76
+ end
@@ -1,39 +1,40 @@
1
- require 'linalg'
1
+ require 'gsl'
2
2
  require 'delegate'
3
3
  require 'stringio'
4
4
 
5
5
  module Semantic
6
6
  module VectorSpace
7
7
 
8
- class Model < DelegateClass(::Linalg::DMatrix)
9
-
8
+ class Model < DelegateClass(::GSL::Matrix)
10
9
  def initialize(matrix, keywords)
11
10
  @keywords = keywords || {}
12
11
  @_dc_obj = matrix
13
12
  super(matrix)
14
13
  end
15
-
14
+
16
15
  def matrix=(matrix)
17
16
  @_dc_obj = matrix
18
17
  end
19
-
18
+
20
19
  def matrix
21
20
  @_dc_obj
22
21
  end
23
-
22
+
24
23
  def to_s
25
24
  out = StringIO.new
26
25
  out.print " " * 9
27
-
28
- matrix.ncol.times do |id|
29
- out.print " D#{id+1} "
26
+
27
+ matrix.size2.times do |id|
28
+ out.print " D#{id+1} "
30
29
  end
31
30
  out.puts
32
31
 
33
- matrix.rows.each_with_index do |terms, index|
34
- out.print "#{@keywords.index(index).ljust(6)}" if @keywords.has_value?(index)
32
+ matrix.to_a.each_with_index do |terms, index|
33
+ # TODO fix for 1.8.7
34
+ out.print "#{@keywords.key(index).ljust(6)}" if @keywords.has_value?(index)
35
35
  out.print "[ "
36
- terms.columns.each do |document|
36
+
37
+ terms.each do |document|
37
38
  out.print "%+0.2f " % document
38
39
  end
39
40
  out.print "]"
@@ -41,7 +42,7 @@ module Semantic
41
42
  end
42
43
  out.string
43
44
  end
44
-
45
+
45
46
  end
46
47
  end
47
48
  end
@@ -2,7 +2,7 @@ module Semantic #:nodoc:
2
2
  class VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 1
5
- TINY = 3
5
+ TINY = 4
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
@@ -0,0 +1,13 @@
1
+ require 'rspec/core/rake_task'
2
+
3
+ desc 'Default: run specs.'
4
+ task :default => :spec
5
+
6
+ desc "Run specs"
7
+ RSpec::Core::RakeTask.new
8
+
9
+ desc "Generate code coverage"
10
+ RSpec::Core::RakeTask.new(:coverage) do |t|
11
+ t.rcov = true
12
+ t.rcov_opts = ['--exclude', 'spec']
13
+ end
metadata CHANGED
@@ -1,136 +1,104 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: rsemantic
3
- version: !ruby/object:Gem::Version
4
- version: 0.1.3
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.4
5
+ prerelease:
5
6
  platform: ruby
6
- authors:
7
+ authors:
7
8
  - Joseph Wilk
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
-
12
- date: 2009-08-01 00:00:00 +01:00
13
- default_executable:
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
16
- name: term-ansicolor
12
+ date: 2012-09-18 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: gsl
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - '='
20
+ - !ruby/object:Gem::Version
21
+ version: 1.14.7
17
22
  type: :runtime
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
20
- requirements:
21
- - - ">="
22
- - !ruby/object:Gem::Version
23
- version: 1.0.3
24
- version:
25
- - !ruby/object:Gem::Dependency
26
- name: rspec
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - '='
28
+ - !ruby/object:Gem::Version
29
+ version: 1.14.7
30
+ - !ruby/object:Gem::Dependency
31
+ name: stemmer
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: 1.0.1
27
38
  type: :runtime
28
- version_requirement:
29
- version_requirements: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: 1.1.5
34
- version:
35
- - !ruby/object:Gem::Dependency
36
- name: diff-lcs
37
- type: :runtime
38
- version_requirement:
39
- version_requirements: !ruby/object:Gem::Requirement
40
- requirements:
41
- - - ">="
42
- - !ruby/object:Gem::Version
43
- version: 1.1.2
44
- version:
45
- - !ruby/object:Gem::Dependency
46
- name: hoe
47
- type: :development
48
- version_requirement:
49
- version_requirements: !ruby/object:Gem::Requirement
50
- requirements:
51
- - - ">="
52
- - !ruby/object:Gem::Version
53
- version: 2.3.2
54
- version:
55
- description: A document vector search with flexible matrix transforms. Currently supports Latent semantic analysis and Term frequency - inverse document frequency
56
- email:
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: 1.0.1
46
+ description: A document vector search with flexible matrix transforms. Currently supports
47
+ Latent semantic analysis and Term frequency - inverse document frequency
48
+ email:
57
49
  - joe@josephwilk.net
58
50
  executables: []
59
-
60
51
  extensions: []
61
-
62
- extra_rdoc_files:
63
- - History.txt
64
- - Manifest.txt
65
- - README.txt
66
- - TODO.txt
67
- files:
52
+ extra_rdoc_files:
68
53
  - History.txt
69
- - Manifest.txt
70
- - README.txt
71
- - Rakefile
54
+ - README.md
72
55
  - TODO.txt
73
- - config/hoe.rb
74
- - config/requirements.rb
75
- - gem_tasks/deployment.rake
76
- - gem_tasks/environment.rake
77
- - gem_tasks/examples.rake
78
- - gem_tasks/fix_cr_lf.rake
79
- - gem_tasks/gemspec.rake
80
- - gem_tasks/rspec.rake
81
- - gem_tasks/website.rake
82
- - lib/semantic.rb
56
+ files:
83
57
  - lib/semantic/compare.rb
58
+ - lib/semantic/corpus.rb
59
+ - lib/semantic/document.rb
84
60
  - lib/semantic/matrix_transformer.rb
85
61
  - lib/semantic/parser.rb
86
62
  - lib/semantic/search.rb
87
- - lib/semantic/transform.rb
63
+ - lib/semantic/search_result.rb
88
64
  - lib/semantic/transform/lsa_transform.rb
89
65
  - lib/semantic/transform/tf_idf_transform.rb
90
- - lib/semantic/vector_space.rb
66
+ - lib/semantic/transform.rb
91
67
  - lib/semantic/vector_space/builder.rb
92
68
  - lib/semantic/vector_space/model.rb
69
+ - lib/semantic/vector_space.rb
93
70
  - lib/semantic/version.rb
71
+ - lib/semantic.rb
72
+ - lib/tasks/rspec.rake
94
73
  - resources/english.stop
95
- - rsemantic.gemspec
96
- - spec/semantic/compare_spec.rb
97
- - spec/semantic/matrix_transformer_spec.rb
98
- - spec/semantic/parser_spec.rb
99
- - spec/semantic/search_spec.rb
100
- - spec/semantic/transform/lsa_transform_spec.rb
101
- - spec/semantic/transform/tf_idf_transform_spec.rb
102
- - spec/semantic/vector_space/builder_spec.rb
103
- - spec/semantic/vector_space/model_spec.rb
104
- - spec/spec.opts
105
- - spec/spec_helper.rb
106
- has_rdoc: true
74
+ - History.txt
75
+ - README.md
76
+ - TODO.txt
107
77
  homepage: http://github.com/josephwilk/rsemantic
108
- licenses: []
109
-
78
+ licenses:
79
+ - MIT
110
80
  post_install_message:
111
- rdoc_options:
112
- - --main
113
- - README.txt
114
- require_paths:
81
+ rdoc_options:
82
+ - --charset=UTF-8
83
+ require_paths:
115
84
  - lib
116
- required_ruby_version: !ruby/object:Gem::Requirement
117
- requirements:
118
- - - ">="
119
- - !ruby/object:Gem::Version
120
- version: "0"
121
- version:
122
- required_rubygems_version: !ruby/object:Gem::Requirement
123
- requirements:
124
- - - ">="
125
- - !ruby/object:Gem::Version
126
- version: "0"
127
- version:
85
+ required_ruby_version: !ruby/object:Gem::Requirement
86
+ none: false
87
+ requirements:
88
+ - - ! '>='
89
+ - !ruby/object:Gem::Version
90
+ version: '0'
91
+ required_rubygems_version: !ruby/object:Gem::Requirement
92
+ none: false
93
+ requirements:
94
+ - - ! '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
128
97
  requirements: []
129
-
130
- rubyforge_project: rsemantic
131
- rubygems_version: 1.3.5
98
+ rubyforge_project:
99
+ rubygems_version: 1.8.24
132
100
  signing_key:
133
- specification_version: 2
134
- summary: A document vector search with flexible matrix transforms. Currently supports Latent semantic analysis and Term frequency - inverse document frequency
101
+ specification_version: 3
102
+ summary: A document vector search with flexible matrix transforms. Currently supports
103
+ Latent semantic analysis and Term frequency - inverse document frequency
135
104
  test_files: []
136
-
data/Manifest.txt DELETED
@@ -1,38 +0,0 @@
1
- History.txt
2
- Manifest.txt
3
- README.txt
4
- Rakefile
5
- TODO.txt
6
- config/hoe.rb
7
- config/requirements.rb
8
- gem_tasks/deployment.rake
9
- gem_tasks/environment.rake
10
- gem_tasks/examples.rake
11
- gem_tasks/fix_cr_lf.rake
12
- gem_tasks/gemspec.rake
13
- gem_tasks/rspec.rake
14
- gem_tasks/website.rake
15
- lib/semantic.rb
16
- lib/semantic/compare.rb
17
- lib/semantic/matrix_transformer.rb
18
- lib/semantic/parser.rb
19
- lib/semantic/search.rb
20
- lib/semantic/transform.rb
21
- lib/semantic/transform/lsa_transform.rb
22
- lib/semantic/transform/tf_idf_transform.rb
23
- lib/semantic/vector_space.rb
24
- lib/semantic/vector_space/builder.rb
25
- lib/semantic/vector_space/model.rb
26
- lib/semantic/version.rb
27
- resources/english.stop
28
- rsemantic.gemspec
29
- spec/semantic/compare_spec.rb
30
- spec/semantic/matrix_transformer_spec.rb
31
- spec/semantic/parser_spec.rb
32
- spec/semantic/search_spec.rb
33
- spec/semantic/transform/lsa_transform_spec.rb
34
- spec/semantic/transform/tf_idf_transform_spec.rb
35
- spec/semantic/vector_space/builder_spec.rb
36
- spec/semantic/vector_space/model_spec.rb
37
- spec/spec.opts
38
- spec/spec_helper.rb