rsemantic 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b772bdf3866ef3155cb94364a88c1b2268c2ffe8
4
+ data.tar.gz: b9e832658a877a1b066ab77388780041fe74e9a0
5
+ SHA512:
6
+ metadata.gz: 30ed38b3a259d1dc5fd8398cb5747d6c396d270d37c9bb2b969c7a39be69a9dca70a4b3a1726df85b75782cf575c6752e0a173e10d353321e59cd4d8ff15a4ed
7
+ data.tar.gz: ffd419f02750472c80094c75eea45eea92e0c1bee9ca445cd7b8c79cf8f40bcbf7c6a9b698437c8465e7e72ada8f4fba8d4cc4d0057fe76cafb214f256b58bf6
data/README.md CHANGED
@@ -12,31 +12,24 @@ Documentation: http://github.com/josephwilk/rsemantic/wikis/home
12
12
  ## Requirements:
13
13
 
14
14
  * GSL - http://www.gnu.org/software/gsl
15
- * stemmer - http://rubyforge.org/projects/stemmer/
16
15
 
17
16
  ## INSTALL:
18
17
 
19
- Note 'brew install GSL' installs 1.15 which is not supported yet by the gsl gem. So you have to switch your GSL version to 1.14.
20
- With homebrew try this:
18
+ Rsemantic requires GSL. With homebrew try this:
21
19
 
22
- <pre><code>
23
- git clone git://github.com/josephwilk/rsemantic.git
24
- cd rsemantic
25
-
26
- brew tap homebrew/versions
27
- brew install gsl114
28
- bundle install
29
- </code></pre>
20
+ ```
21
+ brew install gsl
22
+ ```
30
23
 
31
24
  ## Contributors
32
- * @josephwilk
25
+ * [@josephwilk](http://blog.josephwilk.net)
33
26
  * @dominikhonnef
34
27
 
35
28
  ## LICENSE
36
29
 
37
30
  (The MIT License)
38
31
 
39
- Copyright (c) 2008-2012 Joseph Wilk
32
+ Copyright (c) 2008-2014 Joseph Wilk
40
33
 
41
34
  Permission is hereby granted, free of charge, to any person obtaining
42
35
  a copy of this software and associated documentation files (the
@@ -1,25 +1,24 @@
1
1
  $:.unshift(File.dirname(__FILE__)) unless
2
2
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
3
 
4
- require "semantic/vector_space"
5
- require "semantic/compare"
6
- require "semantic/parser"
7
- require "semantic/matrix_transformer"
8
- require "semantic/search"
9
- require "semantic/transform"
10
- require "semantic/version"
11
-
12
- require "semantic/corpus"
13
- require "semantic/document"
14
- require "semantic/search_result"
15
-
16
- require 'rubygems'
4
+ require "rsemantic/vector_space"
5
+ require "rsemantic/compare"
6
+ require "rsemantic/parser"
7
+ require "rsemantic/matrix_transformer"
8
+ require "rsemantic/search"
9
+ require "rsemantic/transform"
10
+ require "rsemantic/version"
11
+
12
+ require "rsemantic/corpus"
13
+ require "rsemantic/document"
14
+ require "rsemantic/search_result"
15
+
17
16
  require 'gsl'
18
17
 
19
18
  require 'stemmer'
20
19
  require 'logger'
21
20
 
22
- module Semantic
21
+ module RSemantic
23
22
 
24
23
  class << self
25
24
  attr_writer :logger
@@ -1,4 +1,4 @@
1
- module Semantic
1
+ module RSemantic
2
2
  class Compare
3
3
 
4
4
  class << self
@@ -1,4 +1,4 @@
1
- module Semantic
1
+ module RSemantic
2
2
  class Corpus
3
3
  # @return [Array<Document>]
4
4
  attr_reader :documents
@@ -30,7 +30,7 @@ module Semantic
30
30
  #
31
31
  # @return [void]
32
32
  def build_index
33
- @search = Semantic::Search.new(@documents.map(&:text), @options)
33
+ @search = RSemantic::Search.new(@documents.map(&:text), @options)
34
34
  end
35
35
 
36
36
  def search(*words)
@@ -38,14 +38,14 @@ module Semantic
38
38
  results = @search.search(words)
39
39
  results.map.with_index { |result, index|
40
40
  document = @documents[index]
41
- Semantic::SearchResult.new(document, result)
41
+ RSemantic::SearchResult.new(document, result)
42
42
  }.sort
43
43
  end
44
44
 
45
45
  def find_related_document(document)
46
46
  @search.related(@documents.index(document)).map.with_index { |result, index|
47
47
  document = @documents[index]
48
- Semantic::SearchResult.new(document, result)
48
+ RSemantic::SearchResult.new(document, result)
49
49
  }.sort
50
50
  end
51
51
 
@@ -1,4 +1,4 @@
1
- module Semantic
1
+ module RSemantic
2
2
  class Document
3
3
  attr_reader :text
4
4
  attr_reader :attributes
@@ -1,4 +1,4 @@
1
- module Semantic
1
+ module RSemantic
2
2
  class MatrixTransformer
3
3
 
4
4
  def initialize(transforms)
@@ -8,13 +8,13 @@ module Semantic
8
8
  def apply_transforms(vector_space_model)
9
9
  @transforms.each do |transform|
10
10
  begin
11
- transform_class = Semantic::Transform.const_get(transform)
12
- Semantic.logger.info("Applying #{transform} transform")
11
+ transform_class = RSemantic::Transform.const_get(transform)
12
+ RSemantic.logger.info("Applying #{transform} transform")
13
13
  transform_class.transform!(vector_space_model.matrix)
14
- Semantic.logger.info(vector_space_model)
14
+ RSemantic.logger.info(vector_space_model)
15
15
  rescue => e
16
- Semantic.logger.error("Error: Cannot perform transform: #{transform}")
17
- Semantic.logger.error(e)
16
+ RSemantic.logger.error("Error: Cannot perform transform: #{transform}")
17
+ RSemantic.logger.error(e)
18
18
  end
19
19
  end
20
20
  vector_space_model
@@ -1,6 +1,6 @@
1
1
  require "set"
2
2
 
3
- module Semantic
3
+ module RSemantic
4
4
  class Parser
5
5
 
6
6
  def initialize(options = {})
@@ -1,5 +1,6 @@
1
- module Semantic
1
+ module RSemantic
2
2
  class Search
3
+ attr_reader :builder
3
4
 
4
5
  def initialize(documents, options = {})
5
6
  options = {
@@ -8,15 +9,19 @@ module Semantic
8
9
  :filter_stop_words => true,
9
10
  :stem_words => true,
10
11
  }.merge(options)
11
- Semantic.logger.level = options[:verbose] ? Logger::INFO : Logger::ERROR
12
+ RSemantic.logger.level = options[:verbose] ? Logger::INFO : Logger::ERROR
12
13
 
13
14
 
14
- @builder = VectorSpace::Builder.new(:filter_stop_words => options[:filter_stop_words], :stem_words => options[:stem_words])
15
+ @builder = VectorSpace::Builder.new(
16
+ :filter_stop_words => options[:filter_stop_words],
17
+ :stem_words => options[:stem_words],
18
+ :locale => options[:locale]
19
+ )
15
20
  @matrix_transformer = MatrixTransformer.new(options[:transforms])
16
21
 
17
22
  @vector_space_model = @builder.build_document_matrix(documents)
18
23
 
19
- Semantic.logger.info(@vector_space_model)
24
+ RSemantic.logger.info(@vector_space_model)
20
25
 
21
26
  @vector_space_model = @matrix_transformer.apply_transforms(@vector_space_model)
22
27
  end
@@ -37,5 +42,17 @@ module Semantic
37
42
  end
38
43
  ratings
39
44
  end
45
+
46
+ protected
47
+
48
+ def marshal_dump
49
+ [@builder, @matrix_transformer, @vector_space_model.to_a]
50
+ end
51
+
52
+ def marshal_load(array)
53
+ @builder = array.shift
54
+ @matrix_transformer = array.shift
55
+ @vector_space_model = GSL::Matrix.alloc(*array.shift)
56
+ end
40
57
  end
41
58
  end
@@ -1,4 +1,4 @@
1
- module Semantic
1
+ module RSemantic
2
2
  class SearchResult
3
3
  include Comparable
4
4
 
@@ -0,0 +1 @@
1
+ %w{tf_idf lsa}.each{|f| require "rsemantic/transform/#{f}_transform.rb"}
@@ -1,4 +1,4 @@
1
- module Semantic
1
+ module RSemantic
2
2
  module Transform
3
3
  class LSA
4
4
 
@@ -8,11 +8,14 @@ module Semantic
8
8
  # TODO configurable rank
9
9
  columns = matrix.size2
10
10
 
11
- u, v, sigma = matrix.SV_decomp_mod
11
+ # if M < N perform SVD on transposed matrix
12
+ matrix.size1 < matrix.size2 ? (u, v, sigma = matrix.transpose.SV_decomp_mod) : (u, v, sigma = matrix.SV_decomp_mod)
13
+
12
14
  reduce_dimensions!(sigma, rank)
13
15
  sigma = GSL::Matrix.diagonal(sigma)
14
16
 
15
- GSL::Matrix.swap(matrix, u * sigma * v.transpose)
17
+ # if M < N return transposed result
18
+ matrix.size1 < matrix.size2 ? GSL::Matrix.swap(matrix, (u * sigma * v.transpose).transpose) : GSL::Matrix.swap(matrix, u * sigma * v.transpose)
16
19
  end
17
20
 
18
21
  private
@@ -1,4 +1,4 @@
1
- module Semantic
1
+ module RSemantic
2
2
  module Transform
3
3
  class TFIDF
4
4
 
@@ -0,0 +1 @@
1
+ %w{model builder}.each{|f| require "rsemantic/vector_space/#{f}"}
@@ -1,12 +1,16 @@
1
- module Semantic
1
+ module RSemantic
2
2
  module VectorSpace
3
3
  # An algebraic model for representing text documents as vectors of identifiers.
4
4
  # A document is represented as a vector. Each dimension of the vector corresponds to a
5
5
  # separate term. If a term occurs in the document, then the value in the vector is non-zero.
6
6
  class Builder
7
+ attr_reader :parsed_document_cache
7
8
 
8
9
  def initialize(options = {})
9
- @parser = Parser.new(:filter_stop_words => options[:filter_stop_words])
10
+ @parser = Parser.new(
11
+ :filter_stop_words => options[:filter_stop_words],
12
+ :locale => options[:locale]
13
+ )
10
14
  @parsed_document_cache = []
11
15
  end
12
16
 
@@ -2,7 +2,7 @@ require 'gsl'
2
2
  require 'delegate'
3
3
  require 'stringio'
4
4
 
5
- module Semantic
5
+ module RSemantic
6
6
  module VectorSpace
7
7
 
8
8
  class Model < DelegateClass(::GSL::Matrix)
@@ -1,8 +1,8 @@
1
- module Semantic #:nodoc:
1
+ module RSemantic #:nodoc:
2
2
  class VERSION #:nodoc:
3
3
  MAJOR = 0
4
- MINOR = 2
5
- TINY = 1
4
+ MINOR = 3
5
+ TINY = 0
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
metadata CHANGED
@@ -1,48 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rsemantic
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
5
- prerelease:
4
+ version: 0.3.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - Joseph Wilk
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-01-08 00:00:00.000000000 Z
11
+ date: 2014-03-04 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: gsl
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
- - - '='
17
+ - - '>='
20
18
  - !ruby/object:Gem::Version
21
- version: 1.14.7
19
+ version: '0'
22
20
  type: :runtime
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
- - - '='
24
+ - - '>='
28
25
  - !ruby/object:Gem::Version
29
- version: 1.14.7
26
+ version: '0'
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: fast-stemmer
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
- - - ! '>='
31
+ - - '>='
36
32
  - !ruby/object:Gem::Version
37
- version: 1.0.1
33
+ version: '0'
38
34
  type: :runtime
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
- - - ! '>='
38
+ - - '>='
44
39
  - !ruby/object:Gem::Version
45
- version: 1.0.1
40
+ version: '0'
46
41
  description: A document vector search with flexible matrix transforms. Currently supports
47
42
  Latent semantic analysis and Term frequency - inverse document frequency
48
43
  email:
@@ -54,21 +49,24 @@ extra_rdoc_files:
54
49
  - README.md
55
50
  - TODO.txt
56
51
  files:
57
- - lib/semantic/compare.rb
58
- - lib/semantic/corpus.rb
59
- - lib/semantic/document.rb
60
- - lib/semantic/matrix_transformer.rb
61
- - lib/semantic/parser.rb
62
- - lib/semantic/search.rb
63
- - lib/semantic/search_result.rb
64
- - lib/semantic/transform/lsa_transform.rb
65
- - lib/semantic/transform/tf_idf_transform.rb
66
- - lib/semantic/transform.rb
67
- - lib/semantic/vector_space/builder.rb
68
- - lib/semantic/vector_space/model.rb
69
- - lib/semantic/vector_space.rb
70
- - lib/semantic/version.rb
71
- - lib/semantic.rb
52
+ - History.txt
53
+ - README.md
54
+ - TODO.txt
55
+ - lib/rsemantic.rb
56
+ - lib/rsemantic/compare.rb
57
+ - lib/rsemantic/corpus.rb
58
+ - lib/rsemantic/document.rb
59
+ - lib/rsemantic/matrix_transformer.rb
60
+ - lib/rsemantic/parser.rb
61
+ - lib/rsemantic/search.rb
62
+ - lib/rsemantic/search_result.rb
63
+ - lib/rsemantic/transform.rb
64
+ - lib/rsemantic/transform/lsa_transform.rb
65
+ - lib/rsemantic/transform/tf_idf_transform.rb
66
+ - lib/rsemantic/vector_space.rb
67
+ - lib/rsemantic/vector_space/builder.rb
68
+ - lib/rsemantic/vector_space/model.rb
69
+ - lib/rsemantic/version.rb
72
70
  - lib/tasks/rspec.rake
73
71
  - resources/ar.stop
74
72
  - resources/ca.stop
@@ -91,34 +89,30 @@ files:
91
89
  - resources/ru.stop
92
90
  - resources/sv.stop
93
91
  - resources/tr.stop
94
- - History.txt
95
- - README.md
96
- - TODO.txt
97
92
  homepage: http://github.com/josephwilk/rsemantic
98
93
  licenses:
99
94
  - MIT
95
+ metadata: {}
100
96
  post_install_message:
101
97
  rdoc_options:
102
98
  - --charset=UTF-8
103
99
  require_paths:
104
100
  - lib
105
101
  required_ruby_version: !ruby/object:Gem::Requirement
106
- none: false
107
102
  requirements:
108
- - - ! '>='
103
+ - - '>='
109
104
  - !ruby/object:Gem::Version
110
105
  version: '0'
111
106
  required_rubygems_version: !ruby/object:Gem::Requirement
112
- none: false
113
107
  requirements:
114
- - - ! '>='
108
+ - - '>='
115
109
  - !ruby/object:Gem::Version
116
110
  version: '0'
117
111
  requirements: []
118
112
  rubyforge_project:
119
- rubygems_version: 1.8.24
113
+ rubygems_version: 2.2.2
120
114
  signing_key:
121
- specification_version: 3
115
+ specification_version: 4
122
116
  summary: A document vector search with flexible matrix transforms. Currently supports
123
117
  Latent semantic analysis and Term frequency - inverse document frequency
124
118
  test_files: []
@@ -1 +0,0 @@
1
- %w{tf_idf lsa}.each{|f| require "semantic/transform/#{f}_transform.rb"}
@@ -1 +0,0 @@
1
- %w{model builder}.each{|f| require "semantic/vector_space/#{f}"}