rsemantic 0.1.4 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +6 -3
- data/TODO.txt +5 -2
- data/lib/semantic/parser.rb +5 -3
- data/lib/semantic/transform/tf_idf_transform.rb +2 -5
- data/lib/semantic/vector_space/builder.rb +1 -1
- data/lib/semantic/vector_space/model.rb +9 -3
- data/lib/semantic/version.rb +2 -2
- data/lib/tasks/rspec.rake +5 -1
- data/resources/ar.stop +162 -0
- data/resources/ca.stop +124 -0
- data/resources/cs.stop +136 -0
- data/resources/da.stop +99 -0
- data/resources/de.stop +996 -0
- data/resources/el.stop +77 -0
- data/resources/{english.stop → en.stop} +0 -0
- data/resources/es.stop +176 -0
- data/resources/fi.stop +747 -0
- data/resources/fr.stop +124 -0
- data/resources/hu.stop +33 -0
- data/resources/id.stop +329 -0
- data/resources/it.stop +132 -0
- data/resources/ja.stop +44 -0
- data/resources/nl.stop +46 -0
- data/resources/no.stop +117 -0
- data/resources/pl.stop +138 -0
- data/resources/pt.stop +145 -0
- data/resources/ru.stop +421 -0
- data/resources/sv.stop +386 -0
- data/resources/tr.stop +112 -0
- metadata +24 -4
data/README.md
CHANGED
@@ -17,11 +17,14 @@ Documentation: http://github.com/josephwilk/rsemantic/wikis/home
|
|
17
17
|
## INSTALL:
|
18
18
|
|
19
19
|
Note 'brew install GSL' installs 1.15 which is not supported yet by the gsl gem. So you have to switch your GSL version to 1.14.
|
20
|
-
|
20
|
+
With homebrew try this:
|
21
21
|
|
22
|
-
<pre><code>
|
22
|
+
<pre><code>
|
23
|
+
git clone git://github.com/josephwilk/rsemantic.git
|
23
24
|
cd rsemantic
|
24
|
-
|
25
|
+
|
26
|
+
brew tap homebrew/versions
|
27
|
+
brew install gsl114
|
25
28
|
bundle install
|
26
29
|
</code></pre>
|
27
30
|
|
data/TODO.txt
CHANGED
@@ -1,9 +1,12 @@
|
|
1
1
|
== FEATURES/PROBLEMS:
|
2
2
|
|
3
3
|
* Applying transforms to query vectors
|
4
|
-
* Detect the optimal dimension reduction in LSA.
|
5
4
|
* Allow objects to be passed in as transforms.
|
5
|
+
* Hashes might be enough, but a faster data structure might be a good option.
|
6
|
+
* Detect the optimal dimension reduction in LSA.
|
7
|
+
* This needs some benchmark. Low number of dimensions can be effective enough.
|
8
|
+
* http://nlp.stanford.edu/IR-book/html/htmledition/latent-semantic-indexing-1.html
|
6
9
|
* Implement Probabilistic latent semantic analysis
|
7
10
|
* Implement Latent Dirichlet Allocation
|
8
11
|
|
9
|
-
* Matrix transformer has to popout the matrix of VectorSpace::Model and reassign it, get rid of this.
|
12
|
+
* Matrix transformer has to popout the matrix of VectorSpace::Model and reassign it, get rid of this.
|
data/lib/semantic/parser.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
require 'stemmer'
|
2
1
|
require "set"
|
2
|
+
|
3
3
|
module Semantic
|
4
4
|
class Parser
|
5
5
|
|
@@ -8,8 +8,10 @@ module Semantic
|
|
8
8
|
# TODO: nicer way to reference stop file location?
|
9
9
|
@filter_stop_words = options[:filter_stop_words]
|
10
10
|
@stem_words = options[:stem_words]
|
11
|
+
locale = options[:locale] || 'en'
|
12
|
+
|
11
13
|
if @filter_stop_words
|
12
|
-
File.open(File.dirname(__FILE__)
|
14
|
+
File.open("#{File.dirname(__FILE__)}/../../resources/#{locale}.stop", 'r') do |file|
|
13
15
|
@stopwords = Set.new(file.read().split())
|
14
16
|
end
|
15
17
|
end
|
@@ -42,7 +44,7 @@ module Semantic
|
|
42
44
|
words = string.split(" ")
|
43
45
|
|
44
46
|
if @stem_words
|
45
|
-
words.map
|
47
|
+
words.map(&:stem)
|
46
48
|
else
|
47
49
|
words
|
48
50
|
end
|
@@ -8,13 +8,10 @@ module Semantic
|
|
8
8
|
@@number_of_documents_with_term = []
|
9
9
|
|
10
10
|
matrix.transpose.enum_for(:each_row).with_index do |document, column_index|
|
11
|
-
document_term_total = document.sum
|
12
|
-
|
13
11
|
document.enum_for(:each).with_index do |term_weight, row_index|
|
14
12
|
unless term_weight == 0.0
|
15
|
-
inverse_document_frequency = GSL::Sf.log(
|
16
|
-
|
17
|
-
term_frequency = (term_weight / document_term_total)
|
13
|
+
inverse_document_frequency = 1 + GSL::Sf.log(number_of_documents / (number_of_documents_with_term(row_index, matrix).to_f + 1))
|
14
|
+
term_frequency = Math.sqrt(term_weight)
|
18
15
|
|
19
16
|
matrix[row_index, column_index] = term_frequency * inverse_document_frequency
|
20
17
|
end
|
@@ -22,7 +22,7 @@ module Semantic
|
|
22
22
|
|
23
23
|
def to_s
|
24
24
|
out = StringIO.new
|
25
|
-
out.print " " *
|
25
|
+
out.print " " * 12
|
26
26
|
|
27
27
|
matrix.size2.times do |id|
|
28
28
|
out.print " D#{id+1} "
|
@@ -30,8 +30,13 @@ module Semantic
|
|
30
30
|
out.puts
|
31
31
|
|
32
32
|
matrix.to_a.each_with_index do |terms, index|
|
33
|
-
|
34
|
-
|
33
|
+
|
34
|
+
if @keywords.has_value?(index)
|
35
|
+
index_position = @keywords.values.index(index)
|
36
|
+
key = @keywords.keys[index_position]
|
37
|
+
|
38
|
+
out.print "#{key.ljust(10)}"
|
39
|
+
end
|
35
40
|
out.print "[ "
|
36
41
|
|
37
42
|
terms.each do |document|
|
@@ -40,6 +45,7 @@ module Semantic
|
|
40
45
|
out.print "]"
|
41
46
|
out.puts
|
42
47
|
end
|
48
|
+
|
43
49
|
out.string
|
44
50
|
end
|
45
51
|
|
data/lib/semantic/version.rb
CHANGED
data/lib/tasks/rspec.rake
CHANGED
@@ -1,11 +1,15 @@
|
|
1
1
|
require 'rspec/core/rake_task'
|
2
2
|
|
3
3
|
desc 'Default: run specs.'
|
4
|
-
task :default => :spec
|
4
|
+
task :default => [:spec, :integration]
|
5
5
|
|
6
6
|
desc "Run specs"
|
7
7
|
RSpec::Core::RakeTask.new
|
8
8
|
|
9
|
+
RSpec::Core::RakeTask.new(:integration) do |t|
|
10
|
+
t.pattern = 'spec_integration/*_spec.rb'
|
11
|
+
end
|
12
|
+
|
9
13
|
desc "Generate code coverage"
|
10
14
|
RSpec::Core::RakeTask.new(:coverage) do |t|
|
11
15
|
t.rcov = true
|
data/resources/ar.stop
ADDED
@@ -0,0 +1,162 @@
|
|
1
|
+
ب
|
2
|
+
ا
|
3
|
+
أ
|
4
|
+
،
|
5
|
+
عشر
|
6
|
+
عدد
|
7
|
+
عدة
|
8
|
+
عشرة
|
9
|
+
عدم
|
10
|
+
عام
|
11
|
+
عاما
|
12
|
+
عن
|
13
|
+
عند
|
14
|
+
عندما
|
15
|
+
على
|
16
|
+
عليه
|
17
|
+
عليها
|
18
|
+
زيارة
|
19
|
+
سنة
|
20
|
+
سنوات
|
21
|
+
تم
|
22
|
+
ضد
|
23
|
+
بعد
|
24
|
+
بعض
|
25
|
+
اعادة
|
26
|
+
اعلنت
|
27
|
+
بسبب
|
28
|
+
حتى
|
29
|
+
اذا
|
30
|
+
احد
|
31
|
+
اثر
|
32
|
+
برس
|
33
|
+
باسم
|
34
|
+
غدا
|
35
|
+
شخصا
|
36
|
+
صباح
|
37
|
+
اطار
|
38
|
+
اربعة
|
39
|
+
اخرى
|
40
|
+
بان
|
41
|
+
اجل
|
42
|
+
غير
|
43
|
+
بشكل
|
44
|
+
حاليا
|
45
|
+
بن
|
46
|
+
به
|
47
|
+
ثم
|
48
|
+
اف
|
49
|
+
ان
|
50
|
+
او
|
51
|
+
اي
|
52
|
+
بها
|
53
|
+
صفر
|
54
|
+
حيث
|
55
|
+
اكد
|
56
|
+
الا
|
57
|
+
اما
|
58
|
+
امس
|
59
|
+
السابق
|
60
|
+
التى
|
61
|
+
التي
|
62
|
+
اكثر
|
63
|
+
ايار
|
64
|
+
ايضا
|
65
|
+
ثلاثة
|
66
|
+
الذاتي
|
67
|
+
الاخيرة
|
68
|
+
الثاني
|
69
|
+
الثانية
|
70
|
+
الذى
|
71
|
+
الذي
|
72
|
+
الان
|
73
|
+
امام
|
74
|
+
ايام
|
75
|
+
خلال
|
76
|
+
حوالى
|
77
|
+
الذين
|
78
|
+
الاول
|
79
|
+
الاولى
|
80
|
+
بين
|
81
|
+
ذلك
|
82
|
+
دون
|
83
|
+
حول
|
84
|
+
حين
|
85
|
+
الف
|
86
|
+
الى
|
87
|
+
انه
|
88
|
+
اول
|
89
|
+
ضمن
|
90
|
+
انها
|
91
|
+
جميع
|
92
|
+
الماضي
|
93
|
+
الوقت
|
94
|
+
المقبل
|
95
|
+
اليوم
|
96
|
+
ـ
|
97
|
+
ف
|
98
|
+
و
|
99
|
+
و6
|
100
|
+
قد
|
101
|
+
لا
|
102
|
+
ما
|
103
|
+
مع
|
104
|
+
مساء
|
105
|
+
هذا
|
106
|
+
واحد
|
107
|
+
واضاف
|
108
|
+
واضافت
|
109
|
+
فان
|
110
|
+
قبل
|
111
|
+
قال
|
112
|
+
كان
|
113
|
+
لدى
|
114
|
+
نحو
|
115
|
+
هذه
|
116
|
+
وان
|
117
|
+
واكد
|
118
|
+
كانت
|
119
|
+
واوضح
|
120
|
+
مايو
|
121
|
+
فى
|
122
|
+
في
|
123
|
+
كل
|
124
|
+
لم
|
125
|
+
لن
|
126
|
+
له
|
127
|
+
من
|
128
|
+
هو
|
129
|
+
هي
|
130
|
+
قوة
|
131
|
+
كما
|
132
|
+
لها
|
133
|
+
منذ
|
134
|
+
وقد
|
135
|
+
ولا
|
136
|
+
نفسه
|
137
|
+
لقاء
|
138
|
+
مقابل
|
139
|
+
هناك
|
140
|
+
وقال
|
141
|
+
وكان
|
142
|
+
نهاية
|
143
|
+
وقالت
|
144
|
+
وكانت
|
145
|
+
للامم
|
146
|
+
فيه
|
147
|
+
كلم
|
148
|
+
لكن
|
149
|
+
وفي
|
150
|
+
وقف
|
151
|
+
ولم
|
152
|
+
ومن
|
153
|
+
وهو
|
154
|
+
وهي
|
155
|
+
يوم
|
156
|
+
فيها
|
157
|
+
منها
|
158
|
+
مليار
|
159
|
+
لوكالة
|
160
|
+
يكون
|
161
|
+
يمكن
|
162
|
+
مليون
|
data/resources/ca.stop
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
de
|
2
|
+
es
|
3
|
+
i
|
4
|
+
a
|
5
|
+
o
|
6
|
+
un
|
7
|
+
una
|
8
|
+
unes
|
9
|
+
uns
|
10
|
+
un
|
11
|
+
tot
|
12
|
+
també
|
13
|
+
altre
|
14
|
+
algun
|
15
|
+
alguna
|
16
|
+
alguns
|
17
|
+
algunes
|
18
|
+
ser
|
19
|
+
és
|
20
|
+
soc
|
21
|
+
ets
|
22
|
+
som
|
23
|
+
estic
|
24
|
+
està
|
25
|
+
estem
|
26
|
+
esteu
|
27
|
+
estan
|
28
|
+
com
|
29
|
+
en
|
30
|
+
per
|
31
|
+
perquè
|
32
|
+
per que
|
33
|
+
estat
|
34
|
+
estava
|
35
|
+
ans
|
36
|
+
abans
|
37
|
+
éssent
|
38
|
+
ambdós
|
39
|
+
però
|
40
|
+
per
|
41
|
+
poder
|
42
|
+
potser
|
43
|
+
puc
|
44
|
+
podem
|
45
|
+
podeu
|
46
|
+
poden
|
47
|
+
vaig
|
48
|
+
va
|
49
|
+
van
|
50
|
+
fer faig
|
51
|
+
fa
|
52
|
+
fem
|
53
|
+
feu
|
54
|
+
fan
|
55
|
+
cada
|
56
|
+
fi
|
57
|
+
inclòs
|
58
|
+
primer
|
59
|
+
des de
|
60
|
+
conseguir
|
61
|
+
consegueixo
|
62
|
+
consigueix
|
63
|
+
consigueixes
|
64
|
+
conseguim
|
65
|
+
consigueixen
|
66
|
+
anar
|
67
|
+
haver
|
68
|
+
tenir
|
69
|
+
tinc
|
70
|
+
te
|
71
|
+
tenim
|
72
|
+
teniu
|
73
|
+
tene
|
74
|
+
el
|
75
|
+
la
|
76
|
+
les
|
77
|
+
els
|
78
|
+
seu
|
79
|
+
aquí
|
80
|
+
meu
|
81
|
+
teu
|
82
|
+
ells
|
83
|
+
elles
|
84
|
+
ens
|
85
|
+
nosaltres
|
86
|
+
vosaltres
|
87
|
+
si
|
88
|
+
dins
|
89
|
+
sols
|
90
|
+
solament
|
91
|
+
saber
|
92
|
+
saps
|
93
|
+
sap
|
94
|
+
sabem
|
95
|
+
sabeu
|
96
|
+
saben
|
97
|
+
últim
|
98
|
+
llarg
|
99
|
+
bastant fas
|
100
|
+
molts
|
101
|
+
aquells
|
102
|
+
aquelles
|
103
|
+
seus
|
104
|
+
llavors
|
105
|
+
sota
|
106
|
+
dalt
|
107
|
+
ús
|
108
|
+
molt
|
109
|
+
era
|
110
|
+
eres
|
111
|
+
erem
|
112
|
+
eren
|
113
|
+
mode
|
114
|
+
bé
|
115
|
+
quant
|
116
|
+
quan
|
117
|
+
on
|
118
|
+
mentre
|
119
|
+
qui
|
120
|
+
amb
|
121
|
+
entre
|
122
|
+
sense
|
123
|
+
jo
|
124
|
+
aquell
|
data/resources/cs.stop
ADDED
@@ -0,0 +1,136 @@
|
|
1
|
+
dnes
|
2
|
+
cz
|
3
|
+
timto
|
4
|
+
budes
|
5
|
+
budem
|
6
|
+
byli
|
7
|
+
jses
|
8
|
+
muj
|
9
|
+
svym
|
10
|
+
ta
|
11
|
+
tomto
|
12
|
+
tohle
|
13
|
+
tuto
|
14
|
+
tyto
|
15
|
+
jej
|
16
|
+
zda
|
17
|
+
proc
|
18
|
+
mate
|
19
|
+
tato
|
20
|
+
kam
|
21
|
+
tohoto
|
22
|
+
kdo
|
23
|
+
kteri
|
24
|
+
mi
|
25
|
+
nam
|
26
|
+
tom
|
27
|
+
tomuto
|
28
|
+
mit
|
29
|
+
nic
|
30
|
+
proto
|
31
|
+
kterou
|
32
|
+
byla
|
33
|
+
toho
|
34
|
+
protoze
|
35
|
+
asi
|
36
|
+
ho
|
37
|
+
nasi
|
38
|
+
napiste
|
39
|
+
re
|
40
|
+
coz
|
41
|
+
tim
|
42
|
+
takze
|
43
|
+
svych
|
44
|
+
jeji
|
45
|
+
svymi
|
46
|
+
jste
|
47
|
+
aj
|
48
|
+
tu
|
49
|
+
tedy
|
50
|
+
teto bylo
|
51
|
+
kde
|
52
|
+
ke
|
53
|
+
prave
|
54
|
+
ji
|
55
|
+
nad
|
56
|
+
nejsou
|
57
|
+
ci
|
58
|
+
pod
|
59
|
+
tema
|
60
|
+
mezi
|
61
|
+
pres
|
62
|
+
ty
|
63
|
+
pak
|
64
|
+
vam
|
65
|
+
ani
|
66
|
+
kdyz
|
67
|
+
vsak
|
68
|
+
ne
|
69
|
+
jsem
|
70
|
+
tento
|
71
|
+
clanku
|
72
|
+
clanky
|
73
|
+
aby
|
74
|
+
jsme
|
75
|
+
pred
|
76
|
+
pta
|
77
|
+
jejich
|
78
|
+
byl
|
79
|
+
jeste
|
80
|
+
az
|
81
|
+
bez
|
82
|
+
take
|
83
|
+
pouze
|
84
|
+
prvni
|
85
|
+
vase
|
86
|
+
ktera
|
87
|
+
nas
|
88
|
+
novy
|
89
|
+
tipy
|
90
|
+
pokud
|
91
|
+
muze
|
92
|
+
design
|
93
|
+
strana
|
94
|
+
jeho
|
95
|
+
sve
|
96
|
+
jine
|
97
|
+
zpravy
|
98
|
+
nove
|
99
|
+
neni vas
|
100
|
+
jen
|
101
|
+
podle
|
102
|
+
zde
|
103
|
+
clanek
|
104
|
+
uz
|
105
|
+
email
|
106
|
+
byt
|
107
|
+
vice
|
108
|
+
bude
|
109
|
+
jiz
|
110
|
+
nez
|
111
|
+
ktery
|
112
|
+
by
|
113
|
+
ktere
|
114
|
+
co
|
115
|
+
nebo
|
116
|
+
ten
|
117
|
+
tak
|
118
|
+
ma
|
119
|
+
pri
|
120
|
+
od
|
121
|
+
po
|
122
|
+
jsou
|
123
|
+
jak
|
124
|
+
dalsi
|
125
|
+
ale
|
126
|
+
si
|
127
|
+
ve
|
128
|
+
to
|
129
|
+
jako
|
130
|
+
za
|
131
|
+
zpet
|
132
|
+
ze
|
133
|
+
do
|
134
|
+
pro
|
135
|
+
je
|
136
|
+
na
|