summa 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. data/lib/summa.rb +193 -90
  2. metadata +2 -2
data/lib/summa.rb CHANGED
@@ -2,12 +2,12 @@ $:.unshift(File.dirname(__FILE__)) unless
2
2
  $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
3
 
4
4
  module Summa
5
- VERSION = '0.0.1'
5
+ VERSION = '0.0.2'
6
6
  end
7
7
 
8
8
  class String
9
9
  def stem
10
- puts "Just testing!"
10
+ puts "Still testing!... v0.0.2"
11
11
  end
12
12
  end
13
13
 
@@ -19,70 +19,69 @@ class PheremoneAnalysis
19
19
  end
20
20
  @document = document
21
21
  @keywords = keywords
22
- @sigma = constants.sigma
22
+ @sigma = 16
23
23
  @sigma_sq = @sigma * @sigma
24
- @threshold = constants.threshold
24
+ @threshold = 0.8
25
25
  @output = ""
26
26
  end
27
27
 
28
- def analyze()
29
- front = 1/(@sigma * Math.sqrt(2 * Math::PI))
30
-
31
- for i in 0 ... @document.wordArray.length
32
- for j in 0 ... @keywords.wordArray.length
33
- if @document.docArray[i][@keywords.wordArray[j]] != nil
34
- for pos in 0 ... @document.wordArray.length
35
- temp = front * Math.exp(-((i-pos)*(i-pos))/(2*@sigma_sq))
36
- @pheremones[pos] = @pheremones[pos] + temp
37
- end
38
- end
39
- end
40
- end
28
+ def analyze()
29
+ front = 1/(@sigma * Math.sqrt(2 * Math::PI))
30
+ for i in 0 ... @document.wordArray.length
31
+ for j in 0 ... @keywords.wordArray.length
32
+ if @document.docArray[i][@keywords.wordArray[j]] != nil
33
+ for pos in 0 ... @document.wordArray.length
34
+ temp = front * Math.exp(-((i-pos)*(i-pos))/(2*@sigma_sq))
35
+ @pheremones[pos] = @pheremones[pos] + temp
36
+ end
37
+ end
38
+ end
39
+ end
41
40
 
42
- max_value = -1;
43
- for i in 0 ... @pheremones.length
44
- if max_value < @pheremones[i]
45
- max_value = @pheremones[i];
46
- end
47
- end
41
+ max_value = -1;
42
+ for i in 0 ... @pheremones.length
43
+ if max_value < @pheremones[i]
44
+ max_value = @pheremones[i];
45
+ end
46
+ end
48
47
 
49
- for i in 0 ... @pheremones.length
50
- @pheremones[i] /= max_value;
51
- end
52
- end
48
+ for i in 0 ... @pheremones.length
49
+ @pheremones[i] /= max_value;
50
+ end
51
+ end
53
52
 
54
- def summarize()
55
- @output = ""
56
- inRegion = false
57
- for i in 0 ... @pheremones.length
58
- if @pheremones[i] >= @threshold
59
- if inRegion
60
- @output += @document.docArray[i];
61
- @output += " ";
62
- else
63
- inRegion = true
64
- @output += findStartOfSentence(i)
65
- @output += @document.docArray[i];
66
- @output += " "
67
- end
68
- else
69
- if inRegion
70
- inRegion = false
71
- @output += findEndOfSentence(i-1)
72
- @output += " "
73
- end
74
- end
75
- end
76
- @output
77
- end
53
+ def summarize()
54
+ @output = ""
55
+ inRegion = false
56
+ for i in 0 ... @pheremones.length
57
+ if @pheremones[i] >= @threshold
58
+ if inRegion
59
+ @output += @document.docArray[i];
60
+ @output += " ";
61
+ else
62
+ inRegion = true
63
+ @output += findStartOfSentence(i)
64
+ @output += @document.docArray[i];
65
+ @output += " "
66
+ end
67
+ else
68
+ if inRegion
69
+ inRegion = false
70
+ @output += findEndOfSentence(i-1)
71
+ @output += " "
72
+ end
73
+ end
74
+ end
75
+ @output
76
+ end
78
77
 
79
- def findStartOfSentence(i)
80
- index = i
81
- startIndex = 0
82
- local_output = ""
83
- while(index > 0)
84
- word = @document.docArray[index]
85
- if(word != "Mr." && word != "Mrs." && word != "Dr." &&
78
+ def findStartOfSentence(i)
79
+ index = i
80
+ startIndex = 0
81
+ local_output = ""
82
+ while(index > 0)
83
+ word = @document.docArray[index]
84
+ if(word != "Mr." && word != "Mrs." && word != "Dr." &&
86
85
  word != "U.S." && word != "Jan." &&
87
86
  word != "Feb." && word != "Mar." &&
88
87
  word != "Apr." && word != "May." &&
@@ -92,29 +91,29 @@ class PheremoneAnalysis
92
91
  word != "Dec." && word != "Sept." &&
93
92
  word != "Lt." && word != "Maj." &&
94
93
  word != "Col.")
95
- c = word[word.length - 1]
96
- if(c == "."[0] || c == ";"[0] || c == ":"[0])
97
- startIndex = index + 1
98
- break;
99
- end
100
- end
101
- index = index - 1
102
- end
94
+ c = word[word.length - 1]
95
+ if(c == "."[0] || c == ";"[0] || c == ":"[0])
96
+ startIndex = index + 1
97
+ break;
98
+ end
99
+ end
100
+ index = index - 1
101
+ end
103
102
 
104
- for j in startIndex ... i
105
- local_output += @document.docArray[j]
106
- local_output += " "
107
- end
108
- local_output
109
- end
103
+ for j in startIndex ... i
104
+ local_output += @document.docArray[j]
105
+ local_output += " "
106
+ end
107
+ local_output
108
+ end
110
109
 
111
- def findEndOfSentence(i)
112
- endIndex = @document.docArray.length
113
- index = i
114
- local_output = ""
115
- while(index < @document.docArray.length)
116
- word = @document.docArray[index]
117
- if(word != "Mr." && word != "Mrs." && word != "Dr." &&
110
+ def findEndOfSentence(i)
111
+ endIndex = @document.docArray.length
112
+ index = i
113
+ local_output = ""
114
+ while(index < @document.docArray.length)
115
+ word = @document.docArray[index]
116
+ if(word != "Mr." && word != "Mrs." && word != "Dr." &&
118
117
  word != "U.S." && word != "Jan." &&
119
118
  word != "Feb." && word != "Mar." &&
120
119
  word != "Apr." && word != "May." &&
@@ -124,25 +123,129 @@ class PheremoneAnalysis
124
123
  word != "Dec." && word != "Sept." &&
125
124
  word != "Lt." && word != "Maj." &&
126
125
  word != "Col.")
127
- c = word[word.length - 1]
128
- if(c == "."[0] || c == ";"[0] || c == ":"[0])
129
- endIndex = index
130
- break;
126
+ c = word[word.length - 1]
127
+ if(c == "."[0] || c == ";"[0] || c == ":"[0])
128
+ endIndex = index
129
+ break;
130
+ end
131
+ end
132
+ index = index + 1
133
+ end
134
+
135
+ if endIndex != i
136
+ for j in i+1 ... (endIndex + 1)
137
+ local_output += @document.docArray[j]
138
+ local_output += " "
139
+ end
140
+ end
141
+
142
+ local_output
143
+ end
144
+
145
+ attr_accessor :pheremones, :documet, :keywords, :sigma, :threshold, :output
146
+ end
147
+
148
+
149
+ class FrequencyAnalyzer
150
+ def initialize(document,stopWordsDoc)
151
+ @freqCount = {}
152
+ @document = document
153
+ @stopWords = stopWordsDoc
154
+ @mean = 0
155
+ @keywords = [];
156
+
157
+ for i in 0..@document.docArray.length
158
+ word = @document.docArray[i]
159
+ if word != nil
160
+ word = CGWordOps.removePunctuation(word)
161
+ if !@stopWords.docArray.include?(word.downcase)
162
+ #stemmed = word.stem
163
+ if @freqCount.has_key?(word)
164
+ @freqCount[word] =
165
+ @freqCount[word] + 1
166
+ else
167
+ @freqCount[word] = 1
168
+ end
131
169
  end
132
170
  end
133
- index = index + 1
134
171
  end
135
172
 
136
- if endIndex != i
137
- for j in i+1 ... (endIndex + 1)
138
- local_output += @document.docArray[j]
139
- local_output += " "
173
+ #@freqCount.each {|key, value| puts "#{key} is #{value}" }
174
+
175
+ sum = 0
176
+ count = 0
177
+ keys = @freqCount.keys
178
+ for i in 0..keys.length
179
+ if keys[i] != nil
180
+ sum = sum + @freqCount[keys[i]]
181
+ count = count + 1
140
182
  end
141
183
  end
184
+
185
+ @mean = sum/count
186
+ end
187
+
188
+ def analyze(k=3)
189
+ @keywords = [];
142
190
 
143
- local_output
191
+ keys = @freqCount.keys
192
+ for i in 0..keys.length
193
+ if keys[i] != nil
194
+ value = @freqCount[keys[i]]
195
+ if value > k * @mean && keys[i] != ""
196
+ @keywords << keys[i]
197
+ end
198
+ end
199
+ end
200
+ @keywords
201
+ end
202
+ attr_accessor :freqCount, :keywords
203
+ end
204
+
205
+
206
+ class CGWordOps
207
+ def self.removePunctuation(word)
208
+ output = word;
209
+ output = output.delete(".")
210
+ output = output.delete(",")
211
+ output = output.delete("--")
212
+ output = output.delete(";")
213
+ output = output.delete(":")
214
+ output = output.delete("(")
215
+ output = output.delete(")")
216
+ output = output.delete("[")
217
+ output = output.delete("]")
218
+ output = output.delete("?")
219
+ output = output.delete("!")
220
+ output = output.delete("\"")
221
+ output
144
222
  end
145
- attr_accessor :pheremones, :documet, :keywords, :sigma, :threshold, :output
146
223
  end
147
224
 
148
225
 
226
+ class CGDocument
227
+ def initialize(docName)
228
+ @docName = docName
229
+ @docFile = File.new(@docName)
230
+ line=""
231
+ @docArray = [];
232
+ @wordArray = [];
233
+ @docString = "";
234
+ while line != nil
235
+ line = @docFile.gets()
236
+ if line != nil
237
+ lineArray = line.split(" ")
238
+ for i in 0 ... lineArray.length
239
+ @docString += lineArray[i]
240
+ @docString += " "
241
+ @docArray << lineArray[i]
242
+ @wordArray << CGWordOps.removePunctuation(lineArray[i].stem).stem
243
+ end
244
+ end
245
+ end
246
+ end
247
+ #Create getters/setters.
248
+ attr_accessor :docArray, :wordArray, :docName, :docFile, :docString
249
+
250
+ end
251
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: summa
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - HyLiter.org
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-09-21 00:00:00 -07:00
12
+ date: 2009-09-22 00:00:00 -07:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency