rsemantic 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b772bdf3866ef3155cb94364a88c1b2268c2ffe8
4
+ data.tar.gz: b9e832658a877a1b066ab77388780041fe74e9a0
5
+ SHA512:
6
+ metadata.gz: 30ed38b3a259d1dc5fd8398cb5747d6c396d270d37c9bb2b969c7a39be69a9dca70a4b3a1726df85b75782cf575c6752e0a173e10d353321e59cd4d8ff15a4ed
7
+ data.tar.gz: ffd419f02750472c80094c75eea45eea92e0c1bee9ca445cd7b8c79cf8f40bcbf7c6a9b698437c8465e7e72ada8f4fba8d4cc4d0057fe76cafb214f256b58bf6
data/README.md CHANGED
@@ -12,31 +12,24 @@ Documentation: http://github.com/josephwilk/rsemantic/wikis/home
12
12
  ## Requirements:
13
13
 
14
14
  * GSL - http://www.gnu.org/software/gsl
15
- * stemmer - http://rubyforge.org/projects/stemmer/
16
15
 
17
16
  ## INSTALL:
18
17
 
19
- Note 'brew install GSL' installs 1.15 which is not supported yet by the gsl gem. So you have to switch your GSL version to 1.14.
20
- With homebrew try this:
18
+ Rsemantic requires GSL. With homebrew try this:
21
19
 
22
- <pre><code>
23
- git clone git://github.com/josephwilk/rsemantic.git
24
- cd rsemantic
25
-
26
- brew tap homebrew/versions
27
- brew install gsl114
28
- bundle install
29
- </code></pre>
20
+ ```
21
+ brew install gsl
22
+ ```
30
23
 
31
24
  ## Contributors
32
- * @josephwilk
25
+ * [@josephwilk](http://blog.josephwilk.net)
33
26
  * @dominikhonnef
34
27
 
35
28
  ## LICENSE
36
29
 
37
30
  (The MIT License)
38
31
 
39
- Copyright (c) 2008-2012 Joseph Wilk
32
+ Copyright (c) 2008-2014 Joseph Wilk
40
33
 
41
34
  Permission is hereby granted, free of charge, to any person obtaining
42
35
  a copy of this software and associated documentation files (the
@@ -1,25 +1,24 @@
1
1
  $:.unshift(File.dirname(__FILE__)) unless
2
2
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
3
 
4
- require "semantic/vector_space"
5
- require "semantic/compare"
6
- require "semantic/parser"
7
- require "semantic/matrix_transformer"
8
- require "semantic/search"
9
- require "semantic/transform"
10
- require "semantic/version"
11
-
12
- require "semantic/corpus"
13
- require "semantic/document"
14
- require "semantic/search_result"
15
-
16
- require 'rubygems'
4
+ require "rsemantic/vector_space"
5
+ require "rsemantic/compare"
6
+ require "rsemantic/parser"
7
+ require "rsemantic/matrix_transformer"
8
+ require "rsemantic/search"
9
+ require "rsemantic/transform"
10
+ require "rsemantic/version"
11
+
12
+ require "rsemantic/corpus"
13
+ require "rsemantic/document"
14
+ require "rsemantic/search_result"
15
+
17
16
  require 'gsl'
18
17
 
19
18
  require 'stemmer'
20
19
  require 'logger'
21
20
 
22
- module Semantic
21
+ module RSemantic
23
22
 
24
23
  class << self
25
24
  attr_writer :logger
@@ -1,4 +1,4 @@
1
- module Semantic
1
+ module RSemantic
2
2
  class Compare
3
3
 
4
4
  class << self
@@ -1,4 +1,4 @@
1
- module Semantic
1
+ module RSemantic
2
2
  class Corpus
3
3
  # @return [Array<Document>]
4
4
  attr_reader :documents
@@ -30,7 +30,7 @@ module Semantic
30
30
  #
31
31
  # @return [void]
32
32
  def build_index
33
- @search = Semantic::Search.new(@documents.map(&:text), @options)
33
+ @search = RSemantic::Search.new(@documents.map(&:text), @options)
34
34
  end
35
35
 
36
36
  def search(*words)
@@ -38,14 +38,14 @@ module Semantic
38
38
  results = @search.search(words)
39
39
  results.map.with_index { |result, index|
40
40
  document = @documents[index]
41
- Semantic::SearchResult.new(document, result)
41
+ RSemantic::SearchResult.new(document, result)
42
42
  }.sort
43
43
  end
44
44
 
45
45
  def find_related_document(document)
46
46
  @search.related(@documents.index(document)).map.with_index { |result, index|
47
47
  document = @documents[index]
48
- Semantic::SearchResult.new(document, result)
48
+ RSemantic::SearchResult.new(document, result)
49
49
  }.sort
50
50
  end
51
51
 
@@ -1,4 +1,4 @@
1
- module Semantic
1
+ module RSemantic
2
2
  class Document
3
3
  attr_reader :text
4
4
  attr_reader :attributes
@@ -1,4 +1,4 @@
1
- module Semantic
1
+ module RSemantic
2
2
  class MatrixTransformer
3
3
 
4
4
  def initialize(transforms)
@@ -8,13 +8,13 @@ module Semantic
8
8
  def apply_transforms(vector_space_model)
9
9
  @transforms.each do |transform|
10
10
  begin
11
- transform_class = Semantic::Transform.const_get(transform)
12
- Semantic.logger.info("Applying #{transform} transform")
11
+ transform_class = RSemantic::Transform.const_get(transform)
12
+ RSemantic.logger.info("Applying #{transform} transform")
13
13
  transform_class.transform!(vector_space_model.matrix)
14
- Semantic.logger.info(vector_space_model)
14
+ RSemantic.logger.info(vector_space_model)
15
15
  rescue => e
16
- Semantic.logger.error("Error: Cannot perform transform: #{transform}")
17
- Semantic.logger.error(e)
16
+ RSemantic.logger.error("Error: Cannot perform transform: #{transform}")
17
+ RSemantic.logger.error(e)
18
18
  end
19
19
  end
20
20
  vector_space_model
@@ -1,6 +1,6 @@
1
1
  require "set"
2
2
 
3
- module Semantic
3
+ module RSemantic
4
4
  class Parser
5
5
 
6
6
  def initialize(options = {})
@@ -1,5 +1,6 @@
1
- module Semantic
1
+ module RSemantic
2
2
  class Search
3
+ attr_reader :builder
3
4
 
4
5
  def initialize(documents, options = {})
5
6
  options = {
@@ -8,15 +9,19 @@ module Semantic
8
9
  :filter_stop_words => true,
9
10
  :stem_words => true,
10
11
  }.merge(options)
11
- Semantic.logger.level = options[:verbose] ? Logger::INFO : Logger::ERROR
12
+ RSemantic.logger.level = options[:verbose] ? Logger::INFO : Logger::ERROR
12
13
 
13
14
 
14
- @builder = VectorSpace::Builder.new(:filter_stop_words => options[:filter_stop_words], :stem_words => options[:stem_words])
15
+ @builder = VectorSpace::Builder.new(
16
+ :filter_stop_words => options[:filter_stop_words],
17
+ :stem_words => options[:stem_words],
18
+ :locale => options[:locale]
19
+ )
15
20
  @matrix_transformer = MatrixTransformer.new(options[:transforms])
16
21
 
17
22
  @vector_space_model = @builder.build_document_matrix(documents)
18
23
 
19
- Semantic.logger.info(@vector_space_model)
24
+ RSemantic.logger.info(@vector_space_model)
20
25
 
21
26
  @vector_space_model = @matrix_transformer.apply_transforms(@vector_space_model)
22
27
  end
@@ -37,5 +42,17 @@ module Semantic
37
42
  end
38
43
  ratings
39
44
  end
45
+
46
+ protected
47
+
48
+ def marshal_dump
49
+ [@builder, @matrix_transformer, @vector_space_model.to_a]
50
+ end
51
+
52
+ def marshal_load(array)
53
+ @builder = array.shift
54
+ @matrix_transformer = array.shift
55
+ @vector_space_model = GSL::Matrix.alloc(*array.shift)
56
+ end
40
57
  end
41
58
  end
@@ -1,4 +1,4 @@
1
- module Semantic
1
+ module RSemantic
2
2
  class SearchResult
3
3
  include Comparable
4
4
 
@@ -0,0 +1 @@
1
+ %w{tf_idf lsa}.each{|f| require "rsemantic/transform/#{f}_transform.rb"}
@@ -1,4 +1,4 @@
1
- module Semantic
1
+ module RSemantic
2
2
  module Transform
3
3
  class LSA
4
4
 
@@ -8,11 +8,14 @@ module Semantic
8
8
  # TODO configurable rank
9
9
  columns = matrix.size2
10
10
 
11
- u, v, sigma = matrix.SV_decomp_mod
11
+ # if M < N perform SVD on transposed matrix
12
+ matrix.size1 < matrix.size2 ? (u, v, sigma = matrix.transpose.SV_decomp_mod) : (u, v, sigma = matrix.SV_decomp_mod)
13
+
12
14
  reduce_dimensions!(sigma, rank)
13
15
  sigma = GSL::Matrix.diagonal(sigma)
14
16
 
15
- GSL::Matrix.swap(matrix, u * sigma * v.transpose)
17
+ # if M < N return transposed result
18
+ matrix.size1 < matrix.size2 ? GSL::Matrix.swap(matrix, (u * sigma * v.transpose).transpose) : GSL::Matrix.swap(matrix, u * sigma * v.transpose)
16
19
  end
17
20
 
18
21
  private
@@ -1,4 +1,4 @@
1
- module Semantic
1
+ module RSemantic
2
2
  module Transform
3
3
  class TFIDF
4
4
 
@@ -0,0 +1 @@
1
+ %w{model builder}.each{|f| require "rsemantic/vector_space/#{f}"}
@@ -1,12 +1,16 @@
1
- module Semantic
1
+ module RSemantic
2
2
  module VectorSpace
3
3
  # An algebraic model for representing text documents as vectors of identifiers.
4
4
  # A document is represented as a vector. Each dimension of the vector corresponds to a
5
5
  # separate term. If a term occurs in the document, then the value in the vector is non-zero.
6
6
  class Builder
7
+ attr_reader :parsed_document_cache
7
8
 
8
9
  def initialize(options = {})
9
- @parser = Parser.new(:filter_stop_words => options[:filter_stop_words])
10
+ @parser = Parser.new(
11
+ :filter_stop_words => options[:filter_stop_words],
12
+ :locale => options[:locale]
13
+ )
10
14
  @parsed_document_cache = []
11
15
  end
12
16
 
@@ -2,7 +2,7 @@ require 'gsl'
2
2
  require 'delegate'
3
3
  require 'stringio'
4
4
 
5
- module Semantic
5
+ module RSemantic
6
6
  module VectorSpace
7
7
 
8
8
  class Model < DelegateClass(::GSL::Matrix)
@@ -1,8 +1,8 @@
1
- module Semantic #:nodoc:
1
+ module RSemantic #:nodoc:
2
2
  class VERSION #:nodoc:
3
3
  MAJOR = 0
4
- MINOR = 2
5
- TINY = 1
4
+ MINOR = 3
5
+ TINY = 0
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
metadata CHANGED
@@ -1,48 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rsemantic
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
5
- prerelease:
4
+ version: 0.3.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - Joseph Wilk
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-01-08 00:00:00.000000000 Z
11
+ date: 2014-03-04 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: gsl
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
- - - '='
17
+ - - '>='
20
18
  - !ruby/object:Gem::Version
21
- version: 1.14.7
19
+ version: '0'
22
20
  type: :runtime
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
- - - '='
24
+ - - '>='
28
25
  - !ruby/object:Gem::Version
29
- version: 1.14.7
26
+ version: '0'
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: fast-stemmer
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
- - - ! '>='
31
+ - - '>='
36
32
  - !ruby/object:Gem::Version
37
- version: 1.0.1
33
+ version: '0'
38
34
  type: :runtime
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
- - - ! '>='
38
+ - - '>='
44
39
  - !ruby/object:Gem::Version
45
- version: 1.0.1
40
+ version: '0'
46
41
  description: A document vector search with flexible matrix transforms. Currently supports
47
42
  Latent semantic analysis and Term frequency - inverse document frequency
48
43
  email:
@@ -54,21 +49,24 @@ extra_rdoc_files:
54
49
  - README.md
55
50
  - TODO.txt
56
51
  files:
57
- - lib/semantic/compare.rb
58
- - lib/semantic/corpus.rb
59
- - lib/semantic/document.rb
60
- - lib/semantic/matrix_transformer.rb
61
- - lib/semantic/parser.rb
62
- - lib/semantic/search.rb
63
- - lib/semantic/search_result.rb
64
- - lib/semantic/transform/lsa_transform.rb
65
- - lib/semantic/transform/tf_idf_transform.rb
66
- - lib/semantic/transform.rb
67
- - lib/semantic/vector_space/builder.rb
68
- - lib/semantic/vector_space/model.rb
69
- - lib/semantic/vector_space.rb
70
- - lib/semantic/version.rb
71
- - lib/semantic.rb
52
+ - History.txt
53
+ - README.md
54
+ - TODO.txt
55
+ - lib/rsemantic.rb
56
+ - lib/rsemantic/compare.rb
57
+ - lib/rsemantic/corpus.rb
58
+ - lib/rsemantic/document.rb
59
+ - lib/rsemantic/matrix_transformer.rb
60
+ - lib/rsemantic/parser.rb
61
+ - lib/rsemantic/search.rb
62
+ - lib/rsemantic/search_result.rb
63
+ - lib/rsemantic/transform.rb
64
+ - lib/rsemantic/transform/lsa_transform.rb
65
+ - lib/rsemantic/transform/tf_idf_transform.rb
66
+ - lib/rsemantic/vector_space.rb
67
+ - lib/rsemantic/vector_space/builder.rb
68
+ - lib/rsemantic/vector_space/model.rb
69
+ - lib/rsemantic/version.rb
72
70
  - lib/tasks/rspec.rake
73
71
  - resources/ar.stop
74
72
  - resources/ca.stop
@@ -91,34 +89,30 @@ files:
91
89
  - resources/ru.stop
92
90
  - resources/sv.stop
93
91
  - resources/tr.stop
94
- - History.txt
95
- - README.md
96
- - TODO.txt
97
92
  homepage: http://github.com/josephwilk/rsemantic
98
93
  licenses:
99
94
  - MIT
95
+ metadata: {}
100
96
  post_install_message:
101
97
  rdoc_options:
102
98
  - --charset=UTF-8
103
99
  require_paths:
104
100
  - lib
105
101
  required_ruby_version: !ruby/object:Gem::Requirement
106
- none: false
107
102
  requirements:
108
- - - ! '>='
103
+ - - '>='
109
104
  - !ruby/object:Gem::Version
110
105
  version: '0'
111
106
  required_rubygems_version: !ruby/object:Gem::Requirement
112
- none: false
113
107
  requirements:
114
- - - ! '>='
108
+ - - '>='
115
109
  - !ruby/object:Gem::Version
116
110
  version: '0'
117
111
  requirements: []
118
112
  rubyforge_project:
119
- rubygems_version: 1.8.24
113
+ rubygems_version: 2.2.2
120
114
  signing_key:
121
- specification_version: 3
115
+ specification_version: 4
122
116
  summary: A document vector search with flexible matrix transforms. Currently supports
123
117
  Latent semantic analysis and Term frequency - inverse document frequency
124
118
  test_files: []
@@ -1 +0,0 @@
1
- %w{tf_idf lsa}.each{|f| require "semantic/transform/#{f}_transform.rb"}
@@ -1 +0,0 @@
1
- %w{model builder}.each{|f| require "semantic/vector_space/#{f}"}