rsemantic 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +6 -13
- data/lib/{semantic.rb → rsemantic.rb} +13 -14
- data/lib/{semantic → rsemantic}/compare.rb +1 -1
- data/lib/{semantic → rsemantic}/corpus.rb +4 -4
- data/lib/{semantic → rsemantic}/document.rb +1 -1
- data/lib/{semantic → rsemantic}/matrix_transformer.rb +6 -6
- data/lib/{semantic → rsemantic}/parser.rb +1 -1
- data/lib/{semantic → rsemantic}/search.rb +21 -4
- data/lib/{semantic → rsemantic}/search_result.rb +1 -1
- data/lib/rsemantic/transform.rb +1 -0
- data/lib/{semantic → rsemantic}/transform/lsa_transform.rb +6 -3
- data/lib/{semantic → rsemantic}/transform/tf_idf_transform.rb +1 -1
- data/lib/rsemantic/vector_space.rb +1 -0
- data/lib/{semantic → rsemantic}/vector_space/builder.rb +6 -2
- data/lib/{semantic → rsemantic}/vector_space/model.rb +1 -1
- data/lib/{semantic → rsemantic}/version.rb +3 -3
- metadata +33 -39
- data/lib/semantic/transform.rb +0 -1
- data/lib/semantic/vector_space.rb +0 -1
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: b772bdf3866ef3155cb94364a88c1b2268c2ffe8
|
4
|
+
data.tar.gz: b9e832658a877a1b066ab77388780041fe74e9a0
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 30ed38b3a259d1dc5fd8398cb5747d6c396d270d37c9bb2b969c7a39be69a9dca70a4b3a1726df85b75782cf575c6752e0a173e10d353321e59cd4d8ff15a4ed
|
7
|
+
data.tar.gz: ffd419f02750472c80094c75eea45eea92e0c1bee9ca445cd7b8c79cf8f40bcbf7c6a9b698437c8465e7e72ada8f4fba8d4cc4d0057fe76cafb214f256b58bf6
|
data/README.md
CHANGED
@@ -12,31 +12,24 @@ Documentation: http://github.com/josephwilk/rsemantic/wikis/home
|
|
12
12
|
## Requirements:
|
13
13
|
|
14
14
|
* GSL - http://www.gnu.org/software/gsl
|
15
|
-
* stemmer - http://rubyforge.org/projects/stemmer/
|
16
15
|
|
17
16
|
## INSTALL:
|
18
17
|
|
19
|
-
|
20
|
-
With homebrew try this:
|
18
|
+
Rsemantic requires GSL. With homebrew try this:
|
21
19
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
brew tap homebrew/versions
|
27
|
-
brew install gsl114
|
28
|
-
bundle install
|
29
|
-
</code></pre>
|
20
|
+
```
|
21
|
+
brew install gsl
|
22
|
+
```
|
30
23
|
|
31
24
|
## Contributors
|
32
|
-
* @josephwilk
|
25
|
+
* [@josephwilk](http://blog.josephwilk.net)
|
33
26
|
* @dominikhonnef
|
34
27
|
|
35
28
|
## LICENSE
|
36
29
|
|
37
30
|
(The MIT License)
|
38
31
|
|
39
|
-
Copyright (c) 2008-
|
32
|
+
Copyright (c) 2008-2014 Joseph Wilk
|
40
33
|
|
41
34
|
Permission is hereby granted, free of charge, to any person obtaining
|
42
35
|
a copy of this software and associated documentation files (the
|
@@ -1,25 +1,24 @@
|
|
1
1
|
$:.unshift(File.dirname(__FILE__)) unless
|
2
2
|
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
3
3
|
|
4
|
-
require "
|
5
|
-
require "
|
6
|
-
require "
|
7
|
-
require "
|
8
|
-
require "
|
9
|
-
require "
|
10
|
-
require "
|
11
|
-
|
12
|
-
require "
|
13
|
-
require "
|
14
|
-
require "
|
15
|
-
|
16
|
-
require 'rubygems'
|
4
|
+
require "rsemantic/vector_space"
|
5
|
+
require "rsemantic/compare"
|
6
|
+
require "rsemantic/parser"
|
7
|
+
require "rsemantic/matrix_transformer"
|
8
|
+
require "rsemantic/search"
|
9
|
+
require "rsemantic/transform"
|
10
|
+
require "rsemantic/version"
|
11
|
+
|
12
|
+
require "rsemantic/corpus"
|
13
|
+
require "rsemantic/document"
|
14
|
+
require "rsemantic/search_result"
|
15
|
+
|
17
16
|
require 'gsl'
|
18
17
|
|
19
18
|
require 'stemmer'
|
20
19
|
require 'logger'
|
21
20
|
|
22
|
-
module
|
21
|
+
module RSemantic
|
23
22
|
|
24
23
|
class << self
|
25
24
|
attr_writer :logger
|
@@ -1,4 +1,4 @@
|
|
1
|
-
module
|
1
|
+
module RSemantic
|
2
2
|
class Corpus
|
3
3
|
# @return [Array<Document>]
|
4
4
|
attr_reader :documents
|
@@ -30,7 +30,7 @@ module Semantic
|
|
30
30
|
#
|
31
31
|
# @return [void]
|
32
32
|
def build_index
|
33
|
-
@search =
|
33
|
+
@search = RSemantic::Search.new(@documents.map(&:text), @options)
|
34
34
|
end
|
35
35
|
|
36
36
|
def search(*words)
|
@@ -38,14 +38,14 @@ module Semantic
|
|
38
38
|
results = @search.search(words)
|
39
39
|
results.map.with_index { |result, index|
|
40
40
|
document = @documents[index]
|
41
|
-
|
41
|
+
RSemantic::SearchResult.new(document, result)
|
42
42
|
}.sort
|
43
43
|
end
|
44
44
|
|
45
45
|
def find_related_document(document)
|
46
46
|
@search.related(@documents.index(document)).map.with_index { |result, index|
|
47
47
|
document = @documents[index]
|
48
|
-
|
48
|
+
RSemantic::SearchResult.new(document, result)
|
49
49
|
}.sort
|
50
50
|
end
|
51
51
|
|
@@ -1,4 +1,4 @@
|
|
1
|
-
module
|
1
|
+
module RSemantic
|
2
2
|
class MatrixTransformer
|
3
3
|
|
4
4
|
def initialize(transforms)
|
@@ -8,13 +8,13 @@ module Semantic
|
|
8
8
|
def apply_transforms(vector_space_model)
|
9
9
|
@transforms.each do |transform|
|
10
10
|
begin
|
11
|
-
transform_class =
|
12
|
-
|
11
|
+
transform_class = RSemantic::Transform.const_get(transform)
|
12
|
+
RSemantic.logger.info("Applying #{transform} transform")
|
13
13
|
transform_class.transform!(vector_space_model.matrix)
|
14
|
-
|
14
|
+
RSemantic.logger.info(vector_space_model)
|
15
15
|
rescue => e
|
16
|
-
|
17
|
-
|
16
|
+
RSemantic.logger.error("Error: Cannot perform transform: #{transform}")
|
17
|
+
RSemantic.logger.error(e)
|
18
18
|
end
|
19
19
|
end
|
20
20
|
vector_space_model
|
@@ -1,5 +1,6 @@
|
|
1
|
-
module
|
1
|
+
module RSemantic
|
2
2
|
class Search
|
3
|
+
attr_reader :builder
|
3
4
|
|
4
5
|
def initialize(documents, options = {})
|
5
6
|
options = {
|
@@ -8,15 +9,19 @@ module Semantic
|
|
8
9
|
:filter_stop_words => true,
|
9
10
|
:stem_words => true,
|
10
11
|
}.merge(options)
|
11
|
-
|
12
|
+
RSemantic.logger.level = options[:verbose] ? Logger::INFO : Logger::ERROR
|
12
13
|
|
13
14
|
|
14
|
-
@builder = VectorSpace::Builder.new(
|
15
|
+
@builder = VectorSpace::Builder.new(
|
16
|
+
:filter_stop_words => options[:filter_stop_words],
|
17
|
+
:stem_words => options[:stem_words],
|
18
|
+
:locale => options[:locale]
|
19
|
+
)
|
15
20
|
@matrix_transformer = MatrixTransformer.new(options[:transforms])
|
16
21
|
|
17
22
|
@vector_space_model = @builder.build_document_matrix(documents)
|
18
23
|
|
19
|
-
|
24
|
+
RSemantic.logger.info(@vector_space_model)
|
20
25
|
|
21
26
|
@vector_space_model = @matrix_transformer.apply_transforms(@vector_space_model)
|
22
27
|
end
|
@@ -37,5 +42,17 @@ module Semantic
|
|
37
42
|
end
|
38
43
|
ratings
|
39
44
|
end
|
45
|
+
|
46
|
+
protected
|
47
|
+
|
48
|
+
def marshal_dump
|
49
|
+
[@builder, @matrix_transformer, @vector_space_model.to_a]
|
50
|
+
end
|
51
|
+
|
52
|
+
def marshal_load(array)
|
53
|
+
@builder = array.shift
|
54
|
+
@matrix_transformer = array.shift
|
55
|
+
@vector_space_model = GSL::Matrix.alloc(*array.shift)
|
56
|
+
end
|
40
57
|
end
|
41
58
|
end
|
@@ -0,0 +1 @@
|
|
1
|
+
%w{tf_idf lsa}.each{|f| require "rsemantic/transform/#{f}_transform.rb"}
|
@@ -1,4 +1,4 @@
|
|
1
|
-
module
|
1
|
+
module RSemantic
|
2
2
|
module Transform
|
3
3
|
class LSA
|
4
4
|
|
@@ -8,11 +8,14 @@ module Semantic
|
|
8
8
|
# TODO configurable rank
|
9
9
|
columns = matrix.size2
|
10
10
|
|
11
|
-
|
11
|
+
# if M < N perform SVD on transponsed matrix
|
12
|
+
matrix.size1 < matrix.size2 ? (u, v, sigma = matrix.transpose.SV_decomp_mod) : (u, v, sigma = matrix.SV_decomp_mod)
|
13
|
+
|
12
14
|
reduce_dimensions!(sigma, rank)
|
13
15
|
sigma = GSL::Matrix.diagonal(sigma)
|
14
16
|
|
15
|
-
|
17
|
+
# if M < N return transposed result
|
18
|
+
matrix.size1 < matrix.size2 ? GSL::Matrix.swap(matrix, (u * sigma * v.transpose).transpose) : GSL::Matrix.swap(matrix, u * sigma * v.transpose)
|
16
19
|
end
|
17
20
|
|
18
21
|
private
|
@@ -0,0 +1 @@
|
|
1
|
+
%w{model builder}.each{|f| require "rsemantic/vector_space/#{f}"}
|
@@ -1,12 +1,16 @@
|
|
1
|
-
module
|
1
|
+
module RSemantic
|
2
2
|
module VectorSpace
|
3
3
|
# A algebraic model for representing text documents as vectors of identifiers.
|
4
4
|
# A document is represented as a vector. Each dimension of the vector corresponds to a
|
5
5
|
# separate term. If a term occurs in the document, then the value in the vector is non-zero.
|
6
6
|
class Builder
|
7
|
+
attr_reader :parsed_document_cache
|
7
8
|
|
8
9
|
def initialize(options = {})
|
9
|
-
@parser = Parser.new(
|
10
|
+
@parser = Parser.new(
|
11
|
+
:filter_stop_words => options[:filter_stop_words],
|
12
|
+
:locale => options[:locale]
|
13
|
+
)
|
10
14
|
@parsed_document_cache = []
|
11
15
|
end
|
12
16
|
|
metadata
CHANGED
@@ -1,48 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rsemantic
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
5
|
-
prerelease:
|
4
|
+
version: 0.3.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Joseph Wilk
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2014-03-04 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: gsl
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
|
-
- - '
|
17
|
+
- - '>='
|
20
18
|
- !ruby/object:Gem::Version
|
21
|
-
version:
|
19
|
+
version: '0'
|
22
20
|
type: :runtime
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
|
-
- - '
|
24
|
+
- - '>='
|
28
25
|
- !ruby/object:Gem::Version
|
29
|
-
version:
|
26
|
+
version: '0'
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: fast-stemmer
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
|
-
- -
|
31
|
+
- - '>='
|
36
32
|
- !ruby/object:Gem::Version
|
37
|
-
version:
|
33
|
+
version: '0'
|
38
34
|
type: :runtime
|
39
35
|
prerelease: false
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
37
|
requirements:
|
43
|
-
- -
|
38
|
+
- - '>='
|
44
39
|
- !ruby/object:Gem::Version
|
45
|
-
version:
|
40
|
+
version: '0'
|
46
41
|
description: A document vector search with flexible matrix transforms. Currently supports
|
47
42
|
Latent semantic analysis and Term frequency - inverse document frequency
|
48
43
|
email:
|
@@ -54,21 +49,24 @@ extra_rdoc_files:
|
|
54
49
|
- README.md
|
55
50
|
- TODO.txt
|
56
51
|
files:
|
57
|
-
-
|
58
|
-
-
|
59
|
-
-
|
60
|
-
- lib/
|
61
|
-
- lib/
|
62
|
-
- lib/
|
63
|
-
- lib/
|
64
|
-
- lib/
|
65
|
-
- lib/
|
66
|
-
- lib/
|
67
|
-
- lib/
|
68
|
-
- lib/
|
69
|
-
- lib/
|
70
|
-
- lib/
|
71
|
-
- lib/
|
52
|
+
- History.txt
|
53
|
+
- README.md
|
54
|
+
- TODO.txt
|
55
|
+
- lib/rsemantic.rb
|
56
|
+
- lib/rsemantic/compare.rb
|
57
|
+
- lib/rsemantic/corpus.rb
|
58
|
+
- lib/rsemantic/document.rb
|
59
|
+
- lib/rsemantic/matrix_transformer.rb
|
60
|
+
- lib/rsemantic/parser.rb
|
61
|
+
- lib/rsemantic/search.rb
|
62
|
+
- lib/rsemantic/search_result.rb
|
63
|
+
- lib/rsemantic/transform.rb
|
64
|
+
- lib/rsemantic/transform/lsa_transform.rb
|
65
|
+
- lib/rsemantic/transform/tf_idf_transform.rb
|
66
|
+
- lib/rsemantic/vector_space.rb
|
67
|
+
- lib/rsemantic/vector_space/builder.rb
|
68
|
+
- lib/rsemantic/vector_space/model.rb
|
69
|
+
- lib/rsemantic/version.rb
|
72
70
|
- lib/tasks/rspec.rake
|
73
71
|
- resources/ar.stop
|
74
72
|
- resources/ca.stop
|
@@ -91,34 +89,30 @@ files:
|
|
91
89
|
- resources/ru.stop
|
92
90
|
- resources/sv.stop
|
93
91
|
- resources/tr.stop
|
94
|
-
- History.txt
|
95
|
-
- README.md
|
96
|
-
- TODO.txt
|
97
92
|
homepage: http://github.com/josephwilk/rsemantic
|
98
93
|
licenses:
|
99
94
|
- MIT
|
95
|
+
metadata: {}
|
100
96
|
post_install_message:
|
101
97
|
rdoc_options:
|
102
98
|
- --charset=UTF-8
|
103
99
|
require_paths:
|
104
100
|
- lib
|
105
101
|
required_ruby_version: !ruby/object:Gem::Requirement
|
106
|
-
none: false
|
107
102
|
requirements:
|
108
|
-
- -
|
103
|
+
- - '>='
|
109
104
|
- !ruby/object:Gem::Version
|
110
105
|
version: '0'
|
111
106
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
112
|
-
none: false
|
113
107
|
requirements:
|
114
|
-
- -
|
108
|
+
- - '>='
|
115
109
|
- !ruby/object:Gem::Version
|
116
110
|
version: '0'
|
117
111
|
requirements: []
|
118
112
|
rubyforge_project:
|
119
|
-
rubygems_version:
|
113
|
+
rubygems_version: 2.2.2
|
120
114
|
signing_key:
|
121
|
-
specification_version:
|
115
|
+
specification_version: 4
|
122
116
|
summary: A document vector search with flexible matrix transforms. Currently supports
|
123
117
|
Latent semantic analysis and Term frequency - inverse document frequency
|
124
118
|
test_files: []
|
data/lib/semantic/transform.rb
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
%w{tf_idf lsa}.each{|f| require "semantic/transform/#{f}_transform.rb"}
|
@@ -1 +0,0 @@
|
|
1
|
-
%w{model builder}.each{|f| require "semantic/vector_space/#{f}"}
|