rsemantic 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -17,11 +17,14 @@ Documentation: http://github.com/josephwilk/rsemantic/wikis/home
17
17
  ## INSTALL:
18
18
 
19
19
  Note: 'brew install GSL' installs GSL 1.15, which is not yet supported by the gsl gem, so you have to switch your GSL version to 1.14.
20
- See here for details on how to do that with homebrew: http://bretthard.in/2012/03/getting-related_posts-lsi-and-gsl-to-work-in-jekyll/
20
+ With homebrew try this:
21
21
 
22
- <pre><code>git clone git://github.com/josephwilk/rsemantic.git
22
+ <pre><code>
23
+ git clone git://github.com/josephwilk/rsemantic.git
23
24
  cd rsemantic
24
- brew install GSL
25
+
26
+ brew tap homebrew/versions
27
+ brew install gsl114
25
28
  bundle install
26
29
  </code></pre>
27
30
 
data/TODO.txt CHANGED
@@ -1,9 +1,12 @@
1
1
  == FEATURES/PROBLEMS:
2
2
 
3
3
  * Applying transforms to query vectors
4
- * Detect the optimal dimension reduction in LSA.
5
4
  * Allow objects to be passed in as transforms.
5
+ * Hashes might be enough, but a faster data structure might be a good option.
6
+ * Detect the optimal dimension reduction in LSA.
7
+ * This needs some benchmarking. A low number of dimensions can be effective enough.
8
+ * http://nlp.stanford.edu/IR-book/html/htmledition/latent-semantic-indexing-1.html
6
9
  * Implement Probabilistic latent semantic analysis
7
10
  * Implement Latent Dirichlet Allocation
8
11
 
9
- * Matrix transformer has to popout the matrix of VectorSpace::Model and reassign it, get rid of this.
12
+ * Matrix transformer has to popout the matrix of VectorSpace::Model and reassign it, get rid of this.
@@ -1,5 +1,5 @@
1
- require 'stemmer'
2
1
  require "set"
2
+
3
3
  module Semantic
4
4
  class Parser
5
5
 
@@ -8,8 +8,10 @@ module Semantic
8
8
  # TODO: nicer way to reference stop file location?
9
9
  @filter_stop_words = options[:filter_stop_words]
10
10
  @stem_words = options[:stem_words]
11
+ locale = options[:locale] || 'en'
12
+
11
13
  if @filter_stop_words
12
- File.open(File.dirname(__FILE__) + '/../../resources/english.stop', 'r') do |file|
14
+ File.open("#{File.dirname(__FILE__)}/../../resources/#{locale}.stop", 'r') do |file|
13
15
  @stopwords = Set.new(file.read().split())
14
16
  end
15
17
  end
@@ -42,7 +44,7 @@ module Semantic
42
44
  words = string.split(" ")
43
45
 
44
46
  if @stem_words
45
- words.map {|word| Stemmer.stem_word(word) }
47
+ words.map(&:stem)
46
48
  else
47
49
  words
48
50
  end
@@ -8,13 +8,10 @@ module Semantic
8
8
  @@number_of_documents_with_term = []
9
9
 
10
10
  matrix.transpose.enum_for(:each_row).with_index do |document, column_index|
11
- document_term_total = document.sum
12
-
13
11
  document.enum_for(:each).with_index do |term_weight, row_index|
14
12
  unless term_weight == 0.0
15
- inverse_document_frequency = GSL::Sf.log((number_of_documents /
16
- number_of_documents_with_term(row_index, matrix).to_f).abs)
17
- term_frequency = (term_weight / document_term_total)
13
+ inverse_document_frequency = 1 + GSL::Sf.log(number_of_documents / (number_of_documents_with_term(row_index, matrix).to_f + 1))
14
+ term_frequency = Math.sqrt(term_weight)
18
15
 
19
16
  matrix[row_index, column_index] = term_frequency * inverse_document_frequency
20
17
  end
@@ -69,7 +69,7 @@ module Semantic
69
69
  end
70
70
  }
71
71
 
72
- vector
72
+ vector.respond_to?(:to_v) ? vector.to_v : vector
73
73
  end
74
74
  end
75
75
  end
@@ -22,7 +22,7 @@ module Semantic
22
22
 
23
23
  def to_s
24
24
  out = StringIO.new
25
- out.print " " * 9
25
+ out.print " " * 12
26
26
 
27
27
  matrix.size2.times do |id|
28
28
  out.print " D#{id+1} "
@@ -30,8 +30,13 @@ module Semantic
30
30
  out.puts
31
31
 
32
32
  matrix.to_a.each_with_index do |terms, index|
33
- # TODO fix for 1.8.7
34
- out.print "#{@keywords.key(index).ljust(6)}" if @keywords.has_value?(index)
33
+
34
+ if @keywords.has_value?(index)
35
+ index_position = @keywords.values.index(index)
36
+ key = @keywords.keys[index_position]
37
+
38
+ out.print "#{key.ljust(10)}"
39
+ end
35
40
  out.print "[ "
36
41
 
37
42
  terms.each do |document|
@@ -40,6 +45,7 @@ module Semantic
40
45
  out.print "]"
41
46
  out.puts
42
47
  end
48
+
43
49
  out.string
44
50
  end
45
51
 
@@ -1,8 +1,8 @@
1
1
  module Semantic #:nodoc:
2
2
  class VERSION #:nodoc:
3
3
  MAJOR = 0
4
- MINOR = 1
5
- TINY = 4
4
+ MINOR = 2
5
+ TINY = 0
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
data/lib/tasks/rspec.rake CHANGED
@@ -1,11 +1,15 @@
1
1
  require 'rspec/core/rake_task'
2
2
 
3
3
  desc 'Default: run specs.'
4
- task :default => :spec
4
+ task :default => [:spec, :integration]
5
5
 
6
6
  desc "Run specs"
7
7
  RSpec::Core::RakeTask.new
8
8
 
9
+ RSpec::Core::RakeTask.new(:integration) do |t|
10
+ t.pattern = 'spec_integration/*_spec.rb'
11
+ end
12
+
9
13
  desc "Generate code coverage"
10
14
  RSpec::Core::RakeTask.new(:coverage) do |t|
11
15
  t.rcov = true
data/resources/ar.stop ADDED
@@ -0,0 +1,162 @@
1
+ ب
2
+ ا
3
+ أ
4
+ ،
5
+ عشر
6
+ عدد
7
+ عدة
8
+ عشرة
9
+ عدم
10
+ عام
11
+ عاما
12
+ عن
13
+ عند
14
+ عندما
15
+ على
16
+ عليه
17
+ عليها
18
+ زيارة
19
+ سنة
20
+ سنوات
21
+ تم
22
+ ضد
23
+ بعد
24
+ بعض
25
+ اعادة
26
+ اعلنت
27
+ بسبب
28
+ حتى
29
+ اذا
30
+ احد
31
+ اثر
32
+ برس
33
+ باسم
34
+ غدا
35
+ شخصا
36
+ صباح
37
+ اطار
38
+ اربعة
39
+ اخرى
40
+ بان
41
+ اجل
42
+ غير
43
+ بشكل
44
+ حاليا
45
+ بن
46
+ به
47
+ ثم
48
+ اف
49
+ ان
50
+ او
51
+ اي
52
+ بها
53
+ صفر
54
+ حيث
55
+ اكد
56
+ الا
57
+ اما
58
+ امس
59
+ السابق
60
+ التى
61
+ التي
62
+ اكثر
63
+ ايار
64
+ ايضا
65
+ ثلاثة
66
+ الذاتي
67
+ الاخيرة
68
+ الثاني
69
+ الثانية
70
+ الذى
71
+ الذي
72
+ الان
73
+ امام
74
+ ايام
75
+ خلال
76
+ حوالى
77
+ الذين
78
+ الاول
79
+ الاولى
80
+ بين
81
+ ذلك
82
+ دون
83
+ حول
84
+ حين
85
+ الف
86
+ الى
87
+ انه
88
+ اول
89
+ ضمن
90
+ انها
91
+ جميع
92
+ الماضي
93
+ الوقت
94
+ المقبل
95
+ اليوم
96
+ ـ
97
+ ف
98
+ و
99
+ و6
100
+ قد
101
+ لا
102
+ ما
103
+ مع
104
+ مساء
105
+ هذا
106
+ واحد
107
+ واضاف
108
+ واضافت
109
+ فان
110
+ قبل
111
+ قال
112
+ كان
113
+ لدى
114
+ نحو
115
+ هذه
116
+ وان
117
+ واكد
118
+ كانت
119
+ واوضح
120
+ مايو
121
+ فى
122
+ في
123
+ كل
124
+ لم
125
+ لن
126
+ له
127
+ من
128
+ هو
129
+ هي
130
+ قوة
131
+ كما
132
+ لها
133
+ منذ
134
+ وقد
135
+ ولا
136
+ نفسه
137
+ لقاء
138
+ مقابل
139
+ هناك
140
+ وقال
141
+ وكان
142
+ نهاية
143
+ وقالت
144
+ وكانت
145
+ للامم
146
+ فيه
147
+ كلم
148
+ لكن
149
+ وفي
150
+ وقف
151
+ ولم
152
+ ومن
153
+ وهو
154
+ وهي
155
+ يوم
156
+ فيها
157
+ منها
158
+ مليار
159
+ لوكالة
160
+ يكون
161
+ يمكن
162
+ مليون
data/resources/ca.stop ADDED
@@ -0,0 +1,124 @@
1
+ de
2
+ es
3
+ i
4
+ a
5
+ o
6
+ un
7
+ una
8
+ unes
9
+ uns
10
+ un
11
+ tot
12
+ també
13
+ altre
14
+ algun
15
+ alguna
16
+ alguns
17
+ algunes
18
+ ser
19
+ és
20
+ soc
21
+ ets
22
+ som
23
+ estic
24
+ està
25
+ estem
26
+ esteu
27
+ estan
28
+ com
29
+ en
30
+ per
31
+ perquè
32
+ per que
33
+ estat
34
+ estava
35
+ ans
36
+ abans
37
+ éssent
38
+ ambdós
39
+ però
40
+ per
41
+ poder
42
+ potser
43
+ puc
44
+ podem
45
+ podeu
46
+ poden
47
+ vaig
48
+ va
49
+ van
50
+ fer faig
51
+ fa
52
+ fem
53
+ feu
54
+ fan
55
+ cada
56
+ fi
57
+ inclòs
58
+ primer
59
+ des de
60
+ conseguir
61
+ consegueixo
62
+ consigueix
63
+ consigueixes
64
+ conseguim
65
+ consigueixen
66
+ anar
67
+ haver
68
+ tenir
69
+ tinc
70
+ te
71
+ tenim
72
+ teniu
73
+ tene
74
+ el
75
+ la
76
+ les
77
+ els
78
+ seu
79
+ aquí
80
+ meu
81
+ teu
82
+ ells
83
+ elles
84
+ ens
85
+ nosaltres
86
+ vosaltres
87
+ si
88
+ dins
89
+ sols
90
+ solament
91
+ saber
92
+ saps
93
+ sap
94
+ sabem
95
+ sabeu
96
+ saben
97
+ últim
98
+ llarg
99
+ bastant fas
100
+ molts
101
+ aquells
102
+ aquelles
103
+ seus
104
+ llavors
105
+ sota
106
+ dalt
107
+ ús
108
+ molt
109
+ era
110
+ eres
111
+ erem
112
+ eren
113
+ mode
114
+
115
+ quant
116
+ quan
117
+ on
118
+ mentre
119
+ qui
120
+ amb
121
+ entre
122
+ sense
123
+ jo
124
+ aquell
data/resources/cs.stop ADDED
@@ -0,0 +1,136 @@
1
+ dnes
2
+ cz
3
+ timto
4
+ budes
5
+ budem
6
+ byli
7
+ jses
8
+ muj
9
+ svym
10
+ ta
11
+ tomto
12
+ tohle
13
+ tuto
14
+ tyto
15
+ jej
16
+ zda
17
+ proc
18
+ mate
19
+ tato
20
+ kam
21
+ tohoto
22
+ kdo
23
+ kteri
24
+ mi
25
+ nam
26
+ tom
27
+ tomuto
28
+ mit
29
+ nic
30
+ proto
31
+ kterou
32
+ byla
33
+ toho
34
+ protoze
35
+ asi
36
+ ho
37
+ nasi
38
+ napiste
39
+ re
40
+ coz
41
+ tim
42
+ takze
43
+ svych
44
+ jeji
45
+ svymi
46
+ jste
47
+ aj
48
+ tu
49
+ tedy
50
+ teto bylo
51
+ kde
52
+ ke
53
+ prave
54
+ ji
55
+ nad
56
+ nejsou
57
+ ci
58
+ pod
59
+ tema
60
+ mezi
61
+ pres
62
+ ty
63
+ pak
64
+ vam
65
+ ani
66
+ kdyz
67
+ vsak
68
+ ne
69
+ jsem
70
+ tento
71
+ clanku
72
+ clanky
73
+ aby
74
+ jsme
75
+ pred
76
+ pta
77
+ jejich
78
+ byl
79
+ jeste
80
+ az
81
+ bez
82
+ take
83
+ pouze
84
+ prvni
85
+ vase
86
+ ktera
87
+ nas
88
+ novy
89
+ tipy
90
+ pokud
91
+ muze
92
+ design
93
+ strana
94
+ jeho
95
+ sve
96
+ jine
97
+ zpravy
98
+ nove
99
+ neni vas
100
+ jen
101
+ podle
102
+ zde
103
+ clanek
104
+ uz
105
+ email
106
+ byt
107
+ vice
108
+ bude
109
+ jiz
110
+ nez
111
+ ktery
112
+ by
113
+ ktere
114
+ co
115
+ nebo
116
+ ten
117
+ tak
118
+ ma
119
+ pri
120
+ od
121
+ po
122
+ jsou
123
+ jak
124
+ dalsi
125
+ ale
126
+ si
127
+ ve
128
+ to
129
+ jako
130
+ za
131
+ zpet
132
+ ze
133
+ do
134
+ pro
135
+ je
136
+ na