rsemantic 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +6 -13
- data/lib/{semantic.rb → rsemantic.rb} +13 -14
- data/lib/{semantic → rsemantic}/compare.rb +1 -1
- data/lib/{semantic → rsemantic}/corpus.rb +4 -4
- data/lib/{semantic → rsemantic}/document.rb +1 -1
- data/lib/{semantic → rsemantic}/matrix_transformer.rb +6 -6
- data/lib/{semantic → rsemantic}/parser.rb +1 -1
- data/lib/{semantic → rsemantic}/search.rb +21 -4
- data/lib/{semantic → rsemantic}/search_result.rb +1 -1
- data/lib/rsemantic/transform.rb +1 -0
- data/lib/{semantic → rsemantic}/transform/lsa_transform.rb +6 -3
- data/lib/{semantic → rsemantic}/transform/tf_idf_transform.rb +1 -1
- data/lib/rsemantic/vector_space.rb +1 -0
- data/lib/{semantic → rsemantic}/vector_space/builder.rb +6 -2
- data/lib/{semantic → rsemantic}/vector_space/model.rb +1 -1
- data/lib/{semantic → rsemantic}/version.rb +3 -3
- metadata +33 -39
- data/lib/semantic/transform.rb +0 -1
- data/lib/semantic/vector_space.rb +0 -1
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: b772bdf3866ef3155cb94364a88c1b2268c2ffe8
|
4
|
+
data.tar.gz: b9e832658a877a1b066ab77388780041fe74e9a0
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 30ed38b3a259d1dc5fd8398cb5747d6c396d270d37c9bb2b969c7a39be69a9dca70a4b3a1726df85b75782cf575c6752e0a173e10d353321e59cd4d8ff15a4ed
|
7
|
+
data.tar.gz: ffd419f02750472c80094c75eea45eea92e0c1bee9ca445cd7b8c79cf8f40bcbf7c6a9b698437c8465e7e72ada8f4fba8d4cc4d0057fe76cafb214f256b58bf6
|
data/README.md
CHANGED
@@ -12,31 +12,24 @@ Documentation: http://github.com/josephwilk/rsemantic/wikis/home
|
|
12
12
|
## Requirements:
|
13
13
|
|
14
14
|
* GSL - http://www.gnu.org/software/gsl
|
15
|
-
* stemmer - http://rubyforge.org/projects/stemmer/
|
16
15
|
|
17
16
|
## INSTALL:
|
18
17
|
|
19
|
-
|
20
|
-
With homebrew try this:
|
18
|
+
Rsemantic requires GSL. With homebrew try this:
|
21
19
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
brew tap homebrew/versions
|
27
|
-
brew install gsl114
|
28
|
-
bundle install
|
29
|
-
</code></pre>
|
20
|
+
```
|
21
|
+
brew install gsl
|
22
|
+
```
|
30
23
|
|
31
24
|
## Contributors
|
32
|
-
* @josephwilk
|
25
|
+
* [@josephwilk](http://blog.josephwilk.net)
|
33
26
|
* @dominikhonnef
|
34
27
|
|
35
28
|
## LICENSE
|
36
29
|
|
37
30
|
(The MIT License)
|
38
31
|
|
39
|
-
Copyright (c) 2008-
|
32
|
+
Copyright (c) 2008-2014 Joseph Wilk
|
40
33
|
|
41
34
|
Permission is hereby granted, free of charge, to any person obtaining
|
42
35
|
a copy of this software and associated documentation files (the
|
@@ -1,25 +1,24 @@
|
|
1
1
|
$:.unshift(File.dirname(__FILE__)) unless
|
2
2
|
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
3
3
|
|
4
|
-
require "
|
5
|
-
require "
|
6
|
-
require "
|
7
|
-
require "
|
8
|
-
require "
|
9
|
-
require "
|
10
|
-
require "
|
11
|
-
|
12
|
-
require "
|
13
|
-
require "
|
14
|
-
require "
|
15
|
-
|
16
|
-
require 'rubygems'
|
4
|
+
require "rsemantic/vector_space"
|
5
|
+
require "rsemantic/compare"
|
6
|
+
require "rsemantic/parser"
|
7
|
+
require "rsemantic/matrix_transformer"
|
8
|
+
require "rsemantic/search"
|
9
|
+
require "rsemantic/transform"
|
10
|
+
require "rsemantic/version"
|
11
|
+
|
12
|
+
require "rsemantic/corpus"
|
13
|
+
require "rsemantic/document"
|
14
|
+
require "rsemantic/search_result"
|
15
|
+
|
17
16
|
require 'gsl'
|
18
17
|
|
19
18
|
require 'stemmer'
|
20
19
|
require 'logger'
|
21
20
|
|
22
|
-
module
|
21
|
+
module RSemantic
|
23
22
|
|
24
23
|
class << self
|
25
24
|
attr_writer :logger
|
@@ -1,4 +1,4 @@
|
|
1
|
-
module
|
1
|
+
module RSemantic
|
2
2
|
class Corpus
|
3
3
|
# @return [Array<Document>]
|
4
4
|
attr_reader :documents
|
@@ -30,7 +30,7 @@ module Semantic
|
|
30
30
|
#
|
31
31
|
# @return [void]
|
32
32
|
def build_index
|
33
|
-
@search =
|
33
|
+
@search = RSemantic::Search.new(@documents.map(&:text), @options)
|
34
34
|
end
|
35
35
|
|
36
36
|
def search(*words)
|
@@ -38,14 +38,14 @@ module Semantic
|
|
38
38
|
results = @search.search(words)
|
39
39
|
results.map.with_index { |result, index|
|
40
40
|
document = @documents[index]
|
41
|
-
|
41
|
+
RSemantic::SearchResult.new(document, result)
|
42
42
|
}.sort
|
43
43
|
end
|
44
44
|
|
45
45
|
def find_related_document(document)
|
46
46
|
@search.related(@documents.index(document)).map.with_index { |result, index|
|
47
47
|
document = @documents[index]
|
48
|
-
|
48
|
+
RSemantic::SearchResult.new(document, result)
|
49
49
|
}.sort
|
50
50
|
end
|
51
51
|
|
@@ -1,4 +1,4 @@
|
|
1
|
-
module
|
1
|
+
module RSemantic
|
2
2
|
class MatrixTransformer
|
3
3
|
|
4
4
|
def initialize(transforms)
|
@@ -8,13 +8,13 @@ module Semantic
|
|
8
8
|
def apply_transforms(vector_space_model)
|
9
9
|
@transforms.each do |transform|
|
10
10
|
begin
|
11
|
-
transform_class =
|
12
|
-
|
11
|
+
transform_class = RSemantic::Transform.const_get(transform)
|
12
|
+
RSemantic.logger.info("Applying #{transform} transform")
|
13
13
|
transform_class.transform!(vector_space_model.matrix)
|
14
|
-
|
14
|
+
RSemantic.logger.info(vector_space_model)
|
15
15
|
rescue => e
|
16
|
-
|
17
|
-
|
16
|
+
RSemantic.logger.error("Error: Cannot perform transform: #{transform}")
|
17
|
+
RSemantic.logger.error(e)
|
18
18
|
end
|
19
19
|
end
|
20
20
|
vector_space_model
|
@@ -1,5 +1,6 @@
|
|
1
|
-
module
|
1
|
+
module RSemantic
|
2
2
|
class Search
|
3
|
+
attr_reader :builder
|
3
4
|
|
4
5
|
def initialize(documents, options = {})
|
5
6
|
options = {
|
@@ -8,15 +9,19 @@ module Semantic
|
|
8
9
|
:filter_stop_words => true,
|
9
10
|
:stem_words => true,
|
10
11
|
}.merge(options)
|
11
|
-
|
12
|
+
RSemantic.logger.level = options[:verbose] ? Logger::INFO : Logger::ERROR
|
12
13
|
|
13
14
|
|
14
|
-
@builder = VectorSpace::Builder.new(
|
15
|
+
@builder = VectorSpace::Builder.new(
|
16
|
+
:filter_stop_words => options[:filter_stop_words],
|
17
|
+
:stem_words => options[:stem_words],
|
18
|
+
:locale => options[:locale]
|
19
|
+
)
|
15
20
|
@matrix_transformer = MatrixTransformer.new(options[:transforms])
|
16
21
|
|
17
22
|
@vector_space_model = @builder.build_document_matrix(documents)
|
18
23
|
|
19
|
-
|
24
|
+
RSemantic.logger.info(@vector_space_model)
|
20
25
|
|
21
26
|
@vector_space_model = @matrix_transformer.apply_transforms(@vector_space_model)
|
22
27
|
end
|
@@ -37,5 +42,17 @@ module Semantic
|
|
37
42
|
end
|
38
43
|
ratings
|
39
44
|
end
|
45
|
+
|
46
|
+
protected
|
47
|
+
|
48
|
+
def marshal_dump
|
49
|
+
[@builder, @matrix_transformer, @vector_space_model.to_a]
|
50
|
+
end
|
51
|
+
|
52
|
+
def marshal_load(array)
|
53
|
+
@builder = array.shift
|
54
|
+
@matrix_transformer = array.shift
|
55
|
+
@vector_space_model = GSL::Matrix.alloc(*array.shift)
|
56
|
+
end
|
40
57
|
end
|
41
58
|
end
|
@@ -0,0 +1 @@
|
|
1
|
+
%w{tf_idf lsa}.each{|f| require "rsemantic/transform/#{f}_transform.rb"}
|
@@ -1,4 +1,4 @@
|
|
1
|
-
module
|
1
|
+
module RSemantic
|
2
2
|
module Transform
|
3
3
|
class LSA
|
4
4
|
|
@@ -8,11 +8,14 @@ module Semantic
|
|
8
8
|
# TODO configurable rank
|
9
9
|
columns = matrix.size2
|
10
10
|
|
11
|
-
|
11
|
+
# if M < N perform SVD on transposed matrix
|
12
|
+
matrix.size1 < matrix.size2 ? (u, v, sigma = matrix.transpose.SV_decomp_mod) : (u, v, sigma = matrix.SV_decomp_mod)
|
13
|
+
|
12
14
|
reduce_dimensions!(sigma, rank)
|
13
15
|
sigma = GSL::Matrix.diagonal(sigma)
|
14
16
|
|
15
|
-
|
17
|
+
# if M < N return transposed result
|
18
|
+
matrix.size1 < matrix.size2 ? GSL::Matrix.swap(matrix, (u * sigma * v.transpose).transpose) : GSL::Matrix.swap(matrix, u * sigma * v.transpose)
|
16
19
|
end
|
17
20
|
|
18
21
|
private
|
@@ -0,0 +1 @@
|
|
1
|
+
%w{model builder}.each{|f| require "rsemantic/vector_space/#{f}"}
|
@@ -1,12 +1,16 @@
|
|
1
|
-
module
|
1
|
+
module RSemantic
|
2
2
|
module VectorSpace
|
3
3
|
# An algebraic model for representing text documents as vectors of identifiers.
|
4
4
|
# A document is represented as a vector. Each dimension of the vector corresponds to a
|
5
5
|
# separate term. If a term occurs in the document, then the value in the vector is non-zero.
|
6
6
|
class Builder
|
7
|
+
attr_reader :parsed_document_cache
|
7
8
|
|
8
9
|
def initialize(options = {})
|
9
|
-
@parser = Parser.new(
|
10
|
+
@parser = Parser.new(
|
11
|
+
:filter_stop_words => options[:filter_stop_words],
|
12
|
+
:locale => options[:locale]
|
13
|
+
)
|
10
14
|
@parsed_document_cache = []
|
11
15
|
end
|
12
16
|
|
metadata
CHANGED
@@ -1,48 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rsemantic
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
5
|
-
prerelease:
|
4
|
+
version: 0.3.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Joseph Wilk
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2014-03-04 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: gsl
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
|
-
- - '
|
17
|
+
- - '>='
|
20
18
|
- !ruby/object:Gem::Version
|
21
|
-
version:
|
19
|
+
version: '0'
|
22
20
|
type: :runtime
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
|
-
- - '
|
24
|
+
- - '>='
|
28
25
|
- !ruby/object:Gem::Version
|
29
|
-
version:
|
26
|
+
version: '0'
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: fast-stemmer
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
|
-
- -
|
31
|
+
- - '>='
|
36
32
|
- !ruby/object:Gem::Version
|
37
|
-
version:
|
33
|
+
version: '0'
|
38
34
|
type: :runtime
|
39
35
|
prerelease: false
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
37
|
requirements:
|
43
|
-
- -
|
38
|
+
- - '>='
|
44
39
|
- !ruby/object:Gem::Version
|
45
|
-
version:
|
40
|
+
version: '0'
|
46
41
|
description: A document vector search with flexible matrix transforms. Currently supports
|
47
42
|
Latent semantic analysis and Term frequency - inverse document frequency
|
48
43
|
email:
|
@@ -54,21 +49,24 @@ extra_rdoc_files:
|
|
54
49
|
- README.md
|
55
50
|
- TODO.txt
|
56
51
|
files:
|
57
|
-
-
|
58
|
-
-
|
59
|
-
-
|
60
|
-
- lib/
|
61
|
-
- lib/
|
62
|
-
- lib/
|
63
|
-
- lib/
|
64
|
-
- lib/
|
65
|
-
- lib/
|
66
|
-
- lib/
|
67
|
-
- lib/
|
68
|
-
- lib/
|
69
|
-
- lib/
|
70
|
-
- lib/
|
71
|
-
- lib/
|
52
|
+
- History.txt
|
53
|
+
- README.md
|
54
|
+
- TODO.txt
|
55
|
+
- lib/rsemantic.rb
|
56
|
+
- lib/rsemantic/compare.rb
|
57
|
+
- lib/rsemantic/corpus.rb
|
58
|
+
- lib/rsemantic/document.rb
|
59
|
+
- lib/rsemantic/matrix_transformer.rb
|
60
|
+
- lib/rsemantic/parser.rb
|
61
|
+
- lib/rsemantic/search.rb
|
62
|
+
- lib/rsemantic/search_result.rb
|
63
|
+
- lib/rsemantic/transform.rb
|
64
|
+
- lib/rsemantic/transform/lsa_transform.rb
|
65
|
+
- lib/rsemantic/transform/tf_idf_transform.rb
|
66
|
+
- lib/rsemantic/vector_space.rb
|
67
|
+
- lib/rsemantic/vector_space/builder.rb
|
68
|
+
- lib/rsemantic/vector_space/model.rb
|
69
|
+
- lib/rsemantic/version.rb
|
72
70
|
- lib/tasks/rspec.rake
|
73
71
|
- resources/ar.stop
|
74
72
|
- resources/ca.stop
|
@@ -91,34 +89,30 @@ files:
|
|
91
89
|
- resources/ru.stop
|
92
90
|
- resources/sv.stop
|
93
91
|
- resources/tr.stop
|
94
|
-
- History.txt
|
95
|
-
- README.md
|
96
|
-
- TODO.txt
|
97
92
|
homepage: http://github.com/josephwilk/rsemantic
|
98
93
|
licenses:
|
99
94
|
- MIT
|
95
|
+
metadata: {}
|
100
96
|
post_install_message:
|
101
97
|
rdoc_options:
|
102
98
|
- --charset=UTF-8
|
103
99
|
require_paths:
|
104
100
|
- lib
|
105
101
|
required_ruby_version: !ruby/object:Gem::Requirement
|
106
|
-
none: false
|
107
102
|
requirements:
|
108
|
-
- -
|
103
|
+
- - '>='
|
109
104
|
- !ruby/object:Gem::Version
|
110
105
|
version: '0'
|
111
106
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
112
|
-
none: false
|
113
107
|
requirements:
|
114
|
-
- -
|
108
|
+
- - '>='
|
115
109
|
- !ruby/object:Gem::Version
|
116
110
|
version: '0'
|
117
111
|
requirements: []
|
118
112
|
rubyforge_project:
|
119
|
-
rubygems_version:
|
113
|
+
rubygems_version: 2.2.2
|
120
114
|
signing_key:
|
121
|
-
specification_version:
|
115
|
+
specification_version: 4
|
122
116
|
summary: A document vector search with flexible matrix transforms. Currently supports
|
123
117
|
Latent semantic analysis and Term frequency - inverse document frequency
|
124
118
|
test_files: []
|
data/lib/semantic/transform.rb
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
%w{tf_idf lsa}.each{|f| require "semantic/transform/#{f}_transform.rb"}
|
@@ -1 +0,0 @@
|
|
1
|
-
%w{model builder}.each{|f| require "semantic/vector_space/#{f}"}
|