rsemantic 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +6 -3
- data/TODO.txt +5 -2
- data/lib/semantic/parser.rb +5 -3
- data/lib/semantic/transform/tf_idf_transform.rb +2 -5
- data/lib/semantic/vector_space/builder.rb +1 -1
- data/lib/semantic/vector_space/model.rb +9 -3
- data/lib/semantic/version.rb +2 -2
- data/lib/tasks/rspec.rake +5 -1
- data/resources/ar.stop +162 -0
- data/resources/ca.stop +124 -0
- data/resources/cs.stop +136 -0
- data/resources/da.stop +99 -0
- data/resources/de.stop +996 -0
- data/resources/el.stop +77 -0
- data/resources/{english.stop → en.stop} +0 -0
- data/resources/es.stop +176 -0
- data/resources/fi.stop +747 -0
- data/resources/fr.stop +124 -0
- data/resources/hu.stop +33 -0
- data/resources/id.stop +329 -0
- data/resources/it.stop +132 -0
- data/resources/ja.stop +44 -0
- data/resources/nl.stop +46 -0
- data/resources/no.stop +117 -0
- data/resources/pl.stop +138 -0
- data/resources/pt.stop +145 -0
- data/resources/ru.stop +421 -0
- data/resources/sv.stop +386 -0
- data/resources/tr.stop +112 -0
- metadata +24 -4
data/README.md
CHANGED
@@ -17,11 +17,14 @@ Documentation: http://github.com/josephwilk/rsemantic/wikis/home
|
|
17
17
|
## INSTALL:
|
18
18
|
|
19
19
|
Note: 'brew install GSL' installs GSL 1.15, which is not yet supported by the gsl gem, so you have to switch your GSL version to 1.14.
|
20
|
-
|
20
|
+
With homebrew try this:
|
21
21
|
|
22
|
-
<pre><code>
|
22
|
+
<pre><code>
|
23
|
+
git clone git://github.com/josephwilk/rsemantic.git
|
23
24
|
cd rsemantic
|
24
|
-
|
25
|
+
|
26
|
+
brew tap homebrew/versions
|
27
|
+
brew install gsl114
|
25
28
|
bundle install
|
26
29
|
</code></pre>
|
27
30
|
|
data/TODO.txt
CHANGED
@@ -1,9 +1,12 @@
|
|
1
1
|
== FEATURES/PROBLEMS:
|
2
2
|
|
3
3
|
* Applying transforms to query vectors
|
4
|
-
* Detect the optimal dimension reduction in LSA.
|
5
4
|
* Allow objects to be passed in as transforms.
|
5
|
+
* Hashes might be enough, but a faster data structure might be a good option.
|
6
|
+
* Detect the optimal dimension reduction in LSA.
|
7
|
+
* This needs some benchmarking. A low number of dimensions can be effective enough.
|
8
|
+
* http://nlp.stanford.edu/IR-book/html/htmledition/latent-semantic-indexing-1.html
|
6
9
|
* Implement Probabilistic latent semantic analysis
|
7
10
|
* Implement Latent Dirichlet Allocation
|
8
11
|
|
9
|
-
* Matrix transformer has to popout the matrix of VectorSpace::Model and reassign it, get rid of this.
|
12
|
+
* Matrix transformer has to popout the matrix of VectorSpace::Model and reassign it, get rid of this.
|
data/lib/semantic/parser.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
require 'stemmer'
|
2
1
|
require "set"
|
2
|
+
|
3
3
|
module Semantic
|
4
4
|
class Parser
|
5
5
|
|
@@ -8,8 +8,10 @@ module Semantic
|
|
8
8
|
# TODO: nicer way to reference stop file location?
|
9
9
|
@filter_stop_words = options[:filter_stop_words]
|
10
10
|
@stem_words = options[:stem_words]
|
11
|
+
locale = options[:locale] || 'en'
|
12
|
+
|
11
13
|
if @filter_stop_words
|
12
|
-
File.open(File.dirname(__FILE__)
|
14
|
+
File.open("#{File.dirname(__FILE__)}/../../resources/#{locale}.stop", 'r') do |file|
|
13
15
|
@stopwords = Set.new(file.read().split())
|
14
16
|
end
|
15
17
|
end
|
@@ -42,7 +44,7 @@ module Semantic
|
|
42
44
|
words = string.split(" ")
|
43
45
|
|
44
46
|
if @stem_words
|
45
|
-
words.map
|
47
|
+
words.map(&:stem)
|
46
48
|
else
|
47
49
|
words
|
48
50
|
end
|
@@ -8,13 +8,10 @@ module Semantic
|
|
8
8
|
@@number_of_documents_with_term = []
|
9
9
|
|
10
10
|
matrix.transpose.enum_for(:each_row).with_index do |document, column_index|
|
11
|
-
document_term_total = document.sum
|
12
|
-
|
13
11
|
document.enum_for(:each).with_index do |term_weight, row_index|
|
14
12
|
unless term_weight == 0.0
|
15
|
-
inverse_document_frequency = GSL::Sf.log(
|
16
|
-
|
17
|
-
term_frequency = (term_weight / document_term_total)
|
13
|
+
inverse_document_frequency = 1 + GSL::Sf.log(number_of_documents / (number_of_documents_with_term(row_index, matrix).to_f + 1))
|
14
|
+
term_frequency = Math.sqrt(term_weight)
|
18
15
|
|
19
16
|
matrix[row_index, column_index] = term_frequency * inverse_document_frequency
|
20
17
|
end
|
@@ -22,7 +22,7 @@ module Semantic
|
|
22
22
|
|
23
23
|
def to_s
|
24
24
|
out = StringIO.new
|
25
|
-
out.print " " *
|
25
|
+
out.print " " * 12
|
26
26
|
|
27
27
|
matrix.size2.times do |id|
|
28
28
|
out.print " D#{id+1} "
|
@@ -30,8 +30,13 @@ module Semantic
|
|
30
30
|
out.puts
|
31
31
|
|
32
32
|
matrix.to_a.each_with_index do |terms, index|
|
33
|
-
|
34
|
-
|
33
|
+
|
34
|
+
if @keywords.has_value?(index)
|
35
|
+
index_position = @keywords.values.index(index)
|
36
|
+
key = @keywords.keys[index_position]
|
37
|
+
|
38
|
+
out.print "#{key.ljust(10)}"
|
39
|
+
end
|
35
40
|
out.print "[ "
|
36
41
|
|
37
42
|
terms.each do |document|
|
@@ -40,6 +45,7 @@ module Semantic
|
|
40
45
|
out.print "]"
|
41
46
|
out.puts
|
42
47
|
end
|
48
|
+
|
43
49
|
out.string
|
44
50
|
end
|
45
51
|
|
data/lib/semantic/version.rb
CHANGED
data/lib/tasks/rspec.rake
CHANGED
@@ -1,11 +1,15 @@
|
|
1
1
|
require 'rspec/core/rake_task'
|
2
2
|
|
3
3
|
desc 'Default: run specs.'
|
4
|
-
task :default => :spec
|
4
|
+
task :default => [:spec, :integration]
|
5
5
|
|
6
6
|
desc "Run specs"
|
7
7
|
RSpec::Core::RakeTask.new
|
8
8
|
|
9
|
+
RSpec::Core::RakeTask.new(:integration) do |t|
|
10
|
+
t.pattern = 'spec_integration/*_spec.rb'
|
11
|
+
end
|
12
|
+
|
9
13
|
desc "Generate code coverage"
|
10
14
|
RSpec::Core::RakeTask.new(:coverage) do |t|
|
11
15
|
t.rcov = true
|
data/resources/ar.stop
ADDED
@@ -0,0 +1,162 @@
|
|
1
|
+
ب
|
2
|
+
ا
|
3
|
+
أ
|
4
|
+
،
|
5
|
+
عشر
|
6
|
+
عدد
|
7
|
+
عدة
|
8
|
+
عشرة
|
9
|
+
عدم
|
10
|
+
عام
|
11
|
+
عاما
|
12
|
+
عن
|
13
|
+
عند
|
14
|
+
عندما
|
15
|
+
على
|
16
|
+
عليه
|
17
|
+
عليها
|
18
|
+
زيارة
|
19
|
+
سنة
|
20
|
+
سنوات
|
21
|
+
تم
|
22
|
+
ضد
|
23
|
+
بعد
|
24
|
+
بعض
|
25
|
+
اعادة
|
26
|
+
اعلنت
|
27
|
+
بسبب
|
28
|
+
حتى
|
29
|
+
اذا
|
30
|
+
احد
|
31
|
+
اثر
|
32
|
+
برس
|
33
|
+
باسم
|
34
|
+
غدا
|
35
|
+
شخصا
|
36
|
+
صباح
|
37
|
+
اطار
|
38
|
+
اربعة
|
39
|
+
اخرى
|
40
|
+
بان
|
41
|
+
اجل
|
42
|
+
غير
|
43
|
+
بشكل
|
44
|
+
حاليا
|
45
|
+
بن
|
46
|
+
به
|
47
|
+
ثم
|
48
|
+
اف
|
49
|
+
ان
|
50
|
+
او
|
51
|
+
اي
|
52
|
+
بها
|
53
|
+
صفر
|
54
|
+
حيث
|
55
|
+
اكد
|
56
|
+
الا
|
57
|
+
اما
|
58
|
+
امس
|
59
|
+
السابق
|
60
|
+
التى
|
61
|
+
التي
|
62
|
+
اكثر
|
63
|
+
ايار
|
64
|
+
ايضا
|
65
|
+
ثلاثة
|
66
|
+
الذاتي
|
67
|
+
الاخيرة
|
68
|
+
الثاني
|
69
|
+
الثانية
|
70
|
+
الذى
|
71
|
+
الذي
|
72
|
+
الان
|
73
|
+
امام
|
74
|
+
ايام
|
75
|
+
خلال
|
76
|
+
حوالى
|
77
|
+
الذين
|
78
|
+
الاول
|
79
|
+
الاولى
|
80
|
+
بين
|
81
|
+
ذلك
|
82
|
+
دون
|
83
|
+
حول
|
84
|
+
حين
|
85
|
+
الف
|
86
|
+
الى
|
87
|
+
انه
|
88
|
+
اول
|
89
|
+
ضمن
|
90
|
+
انها
|
91
|
+
جميع
|
92
|
+
الماضي
|
93
|
+
الوقت
|
94
|
+
المقبل
|
95
|
+
اليوم
|
96
|
+
ـ
|
97
|
+
ف
|
98
|
+
و
|
99
|
+
و6
|
100
|
+
قد
|
101
|
+
لا
|
102
|
+
ما
|
103
|
+
مع
|
104
|
+
مساء
|
105
|
+
هذا
|
106
|
+
واحد
|
107
|
+
واضاف
|
108
|
+
واضافت
|
109
|
+
فان
|
110
|
+
قبل
|
111
|
+
قال
|
112
|
+
كان
|
113
|
+
لدى
|
114
|
+
نحو
|
115
|
+
هذه
|
116
|
+
وان
|
117
|
+
واكد
|
118
|
+
كانت
|
119
|
+
واوضح
|
120
|
+
مايو
|
121
|
+
فى
|
122
|
+
في
|
123
|
+
كل
|
124
|
+
لم
|
125
|
+
لن
|
126
|
+
له
|
127
|
+
من
|
128
|
+
هو
|
129
|
+
هي
|
130
|
+
قوة
|
131
|
+
كما
|
132
|
+
لها
|
133
|
+
منذ
|
134
|
+
وقد
|
135
|
+
ولا
|
136
|
+
نفسه
|
137
|
+
لقاء
|
138
|
+
مقابل
|
139
|
+
هناك
|
140
|
+
وقال
|
141
|
+
وكان
|
142
|
+
نهاية
|
143
|
+
وقالت
|
144
|
+
وكانت
|
145
|
+
للامم
|
146
|
+
فيه
|
147
|
+
كلم
|
148
|
+
لكن
|
149
|
+
وفي
|
150
|
+
وقف
|
151
|
+
ولم
|
152
|
+
ومن
|
153
|
+
وهو
|
154
|
+
وهي
|
155
|
+
يوم
|
156
|
+
فيها
|
157
|
+
منها
|
158
|
+
مليار
|
159
|
+
لوكالة
|
160
|
+
يكون
|
161
|
+
يمكن
|
162
|
+
مليون
|
data/resources/ca.stop
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
de
|
2
|
+
es
|
3
|
+
i
|
4
|
+
a
|
5
|
+
o
|
6
|
+
un
|
7
|
+
una
|
8
|
+
unes
|
9
|
+
uns
|
10
|
+
un
|
11
|
+
tot
|
12
|
+
també
|
13
|
+
altre
|
14
|
+
algun
|
15
|
+
alguna
|
16
|
+
alguns
|
17
|
+
algunes
|
18
|
+
ser
|
19
|
+
és
|
20
|
+
soc
|
21
|
+
ets
|
22
|
+
som
|
23
|
+
estic
|
24
|
+
està
|
25
|
+
estem
|
26
|
+
esteu
|
27
|
+
estan
|
28
|
+
com
|
29
|
+
en
|
30
|
+
per
|
31
|
+
perquè
|
32
|
+
per que
|
33
|
+
estat
|
34
|
+
estava
|
35
|
+
ans
|
36
|
+
abans
|
37
|
+
éssent
|
38
|
+
ambdós
|
39
|
+
però
|
40
|
+
per
|
41
|
+
poder
|
42
|
+
potser
|
43
|
+
puc
|
44
|
+
podem
|
45
|
+
podeu
|
46
|
+
poden
|
47
|
+
vaig
|
48
|
+
va
|
49
|
+
van
|
50
|
+
fer faig
|
51
|
+
fa
|
52
|
+
fem
|
53
|
+
feu
|
54
|
+
fan
|
55
|
+
cada
|
56
|
+
fi
|
57
|
+
inclòs
|
58
|
+
primer
|
59
|
+
des de
|
60
|
+
conseguir
|
61
|
+
consegueixo
|
62
|
+
consigueix
|
63
|
+
consigueixes
|
64
|
+
conseguim
|
65
|
+
consigueixen
|
66
|
+
anar
|
67
|
+
haver
|
68
|
+
tenir
|
69
|
+
tinc
|
70
|
+
te
|
71
|
+
tenim
|
72
|
+
teniu
|
73
|
+
tene
|
74
|
+
el
|
75
|
+
la
|
76
|
+
les
|
77
|
+
els
|
78
|
+
seu
|
79
|
+
aquí
|
80
|
+
meu
|
81
|
+
teu
|
82
|
+
ells
|
83
|
+
elles
|
84
|
+
ens
|
85
|
+
nosaltres
|
86
|
+
vosaltres
|
87
|
+
si
|
88
|
+
dins
|
89
|
+
sols
|
90
|
+
solament
|
91
|
+
saber
|
92
|
+
saps
|
93
|
+
sap
|
94
|
+
sabem
|
95
|
+
sabeu
|
96
|
+
saben
|
97
|
+
últim
|
98
|
+
llarg
|
99
|
+
bastant fas
|
100
|
+
molts
|
101
|
+
aquells
|
102
|
+
aquelles
|
103
|
+
seus
|
104
|
+
llavors
|
105
|
+
sota
|
106
|
+
dalt
|
107
|
+
ús
|
108
|
+
molt
|
109
|
+
era
|
110
|
+
eres
|
111
|
+
erem
|
112
|
+
eren
|
113
|
+
mode
|
114
|
+
bé
|
115
|
+
quant
|
116
|
+
quan
|
117
|
+
on
|
118
|
+
mentre
|
119
|
+
qui
|
120
|
+
amb
|
121
|
+
entre
|
122
|
+
sense
|
123
|
+
jo
|
124
|
+
aquell
|
data/resources/cs.stop
ADDED
@@ -0,0 +1,136 @@
|
|
1
|
+
dnes
|
2
|
+
cz
|
3
|
+
timto
|
4
|
+
budes
|
5
|
+
budem
|
6
|
+
byli
|
7
|
+
jses
|
8
|
+
muj
|
9
|
+
svym
|
10
|
+
ta
|
11
|
+
tomto
|
12
|
+
tohle
|
13
|
+
tuto
|
14
|
+
tyto
|
15
|
+
jej
|
16
|
+
zda
|
17
|
+
proc
|
18
|
+
mate
|
19
|
+
tato
|
20
|
+
kam
|
21
|
+
tohoto
|
22
|
+
kdo
|
23
|
+
kteri
|
24
|
+
mi
|
25
|
+
nam
|
26
|
+
tom
|
27
|
+
tomuto
|
28
|
+
mit
|
29
|
+
nic
|
30
|
+
proto
|
31
|
+
kterou
|
32
|
+
byla
|
33
|
+
toho
|
34
|
+
protoze
|
35
|
+
asi
|
36
|
+
ho
|
37
|
+
nasi
|
38
|
+
napiste
|
39
|
+
re
|
40
|
+
coz
|
41
|
+
tim
|
42
|
+
takze
|
43
|
+
svych
|
44
|
+
jeji
|
45
|
+
svymi
|
46
|
+
jste
|
47
|
+
aj
|
48
|
+
tu
|
49
|
+
tedy
|
50
|
+
teto bylo
|
51
|
+
kde
|
52
|
+
ke
|
53
|
+
prave
|
54
|
+
ji
|
55
|
+
nad
|
56
|
+
nejsou
|
57
|
+
ci
|
58
|
+
pod
|
59
|
+
tema
|
60
|
+
mezi
|
61
|
+
pres
|
62
|
+
ty
|
63
|
+
pak
|
64
|
+
vam
|
65
|
+
ani
|
66
|
+
kdyz
|
67
|
+
vsak
|
68
|
+
ne
|
69
|
+
jsem
|
70
|
+
tento
|
71
|
+
clanku
|
72
|
+
clanky
|
73
|
+
aby
|
74
|
+
jsme
|
75
|
+
pred
|
76
|
+
pta
|
77
|
+
jejich
|
78
|
+
byl
|
79
|
+
jeste
|
80
|
+
az
|
81
|
+
bez
|
82
|
+
take
|
83
|
+
pouze
|
84
|
+
prvni
|
85
|
+
vase
|
86
|
+
ktera
|
87
|
+
nas
|
88
|
+
novy
|
89
|
+
tipy
|
90
|
+
pokud
|
91
|
+
muze
|
92
|
+
design
|
93
|
+
strana
|
94
|
+
jeho
|
95
|
+
sve
|
96
|
+
jine
|
97
|
+
zpravy
|
98
|
+
nove
|
99
|
+
neni vas
|
100
|
+
jen
|
101
|
+
podle
|
102
|
+
zde
|
103
|
+
clanek
|
104
|
+
uz
|
105
|
+
email
|
106
|
+
byt
|
107
|
+
vice
|
108
|
+
bude
|
109
|
+
jiz
|
110
|
+
nez
|
111
|
+
ktery
|
112
|
+
by
|
113
|
+
ktere
|
114
|
+
co
|
115
|
+
nebo
|
116
|
+
ten
|
117
|
+
tak
|
118
|
+
ma
|
119
|
+
pri
|
120
|
+
od
|
121
|
+
po
|
122
|
+
jsou
|
123
|
+
jak
|
124
|
+
dalsi
|
125
|
+
ale
|
126
|
+
si
|
127
|
+
ve
|
128
|
+
to
|
129
|
+
jako
|
130
|
+
za
|
131
|
+
zpet
|
132
|
+
ze
|
133
|
+
do
|
134
|
+
pro
|
135
|
+
je
|
136
|
+
na
|