summa 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/summa.rb +193 -90
- metadata +2 -2
data/lib/summa.rb
CHANGED
@@ -2,12 +2,12 @@ $:.unshift(File.dirname(__FILE__)) unless
|
|
2
2
|
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
3
3
|
|
4
4
|
# Holds the library's version string.
module Summa
  # Frozen so the shared constant cannot be mutated in place by a caller.
  VERSION = '0.0.2'.freeze
end
|
7
7
|
|
8
8
|
class String
  # Placeholder stemmer: prints a notice and performs no stemming.
  #
  # Returns the receiver unchanged, so that callers can chain further String
  # calls on the result. Returning the value of +puts+ (nil) would break any
  # caller that treats the result as a word.
  def stem
    puts "Still testing!... v0.0.2"
    self
  end
end
|
13
13
|
|
@@ -19,70 +19,69 @@ class PheremoneAnalysis
|
|
19
19
|
end
|
20
20
|
@document = document
|
21
21
|
@keywords = keywords
|
22
|
-
@sigma =
|
22
|
+
@sigma = 16
|
23
23
|
@sigma_sq = @sigma * @sigma
|
24
|
-
@threshold =
|
24
|
+
@threshold = 0.8
|
25
25
|
@output = ""
|
26
26
|
end
|
27
27
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
end
|
28
|
+
# Scores every word position by its proximity to keyword occurrences.
#
# For each document token that contains one of the keywords as a substring
# (String#[] lookup), a Gaussian-shaped contribution centred on that token,
# with standard deviation @sigma, is added to every entry of @pheremones.
# The scores are then scaled so that the largest one equals 1.
#
# Expects @pheremones to already hold one numeric entry per word; that setup
# is not visible in this method -- presumably done in initialize (TODO confirm).
#
# Returns @pheremones, normalised in place.
def analyze()
  front = 1 / (@sigma * Math.sqrt(2 * Math::PI))
  # Float denominator on purpose: with two Integers, -(d * d) / (2 * sigma^2)
  # floor-divides (e.g. -1 / 512 == -1) and turns the curve into coarse steps.
  two_sigma_sq = 2.0 * @sigma_sq
  word_count = @document.wordArray.length

  word_count.times do |i|
    token = @document.docArray[i]
    @keywords.wordArray.each do |keyword|
      next if token[keyword].nil?

      word_count.times do |pos|
        offset = i - pos
        @pheremones[pos] += front * Math.exp(-(offset * offset) / two_sigma_sq)
      end
    end
  end

  # Leave the scores untouched when nothing scored above zero: dividing by a
  # zero peak raises ZeroDivisionError for Integers and yields NaN for Floats.
  peak = @pheremones.max
  @pheremones.map! { |score| score / peak } if peak && peak > 0
  @pheremones
end
|
53
52
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
53
|
+
# Builds the summary text from the scored word positions.
#
# Walks @pheremones in order. Consecutive positions scoring at or above
# @threshold form a region: when a region opens, the text returned by
# findStartOfSentence is emitted first; each word of the region follows with a
# trailing space; when the region closes, the text returned by
# findEndOfSentence for the region's last position is appended, plus a space.
#
# Resets and returns @output.
def summarize()
  @output = ""
  in_region = false

  @pheremones.each_index do |idx|
    if @pheremones[idx] >= @threshold
      unless in_region
        in_region = true
        @output += findStartOfSentence(idx)
      end
      @output += @document.docArray[idx]
      @output += " "
    elsif in_region
      in_region = false
      @output += findEndOfSentence(idx - 1)
      @output += " "
    end
  end

  @output
end
|
78
77
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
78
|
+
def findStartOfSentence(i)
|
79
|
+
index = i
|
80
|
+
startIndex = 0
|
81
|
+
local_output = ""
|
82
|
+
while(index > 0)
|
83
|
+
word = @document.docArray[index]
|
84
|
+
if(word != "Mr." && word != "Mrs." && word != "Dr." &&
|
86
85
|
word != "U.S." && word != "Jan." &&
|
87
86
|
word != "Feb." && word != "Mar." &&
|
88
87
|
word != "Apr." && word != "May." &&
|
@@ -92,29 +91,29 @@ class PheremoneAnalysis
|
|
92
91
|
word != "Dec." && word != "Sept." &&
|
93
92
|
word != "Lt." && word != "Maj." &&
|
94
93
|
word != "Col.")
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
94
|
+
c = word[word.length - 1]
|
95
|
+
if(c == "."[0] || c == ";"[0] || c == ":"[0])
|
96
|
+
startIndex = index + 1
|
97
|
+
break;
|
98
|
+
end
|
99
|
+
end
|
100
|
+
index = index - 1
|
101
|
+
end
|
103
102
|
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
103
|
+
for j in startIndex ... i
|
104
|
+
local_output += @document.docArray[j]
|
105
|
+
local_output += " "
|
106
|
+
end
|
107
|
+
local_output
|
108
|
+
end
|
110
109
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
110
|
+
def findEndOfSentence(i)
|
111
|
+
endIndex = @document.docArray.length
|
112
|
+
index = i
|
113
|
+
local_output = ""
|
114
|
+
while(index < @document.docArray.length)
|
115
|
+
word = @document.docArray[index]
|
116
|
+
if(word != "Mr." && word != "Mrs." && word != "Dr." &&
|
118
117
|
word != "U.S." && word != "Jan." &&
|
119
118
|
word != "Feb." && word != "Mar." &&
|
120
119
|
word != "Apr." && word != "May." &&
|
@@ -124,25 +123,129 @@ class PheremoneAnalysis
|
|
124
123
|
word != "Dec." && word != "Sept." &&
|
125
124
|
word != "Lt." && word != "Maj." &&
|
126
125
|
word != "Col.")
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
126
|
+
c = word[word.length - 1]
|
127
|
+
if(c == "."[0] || c == ";"[0] || c == ":"[0])
|
128
|
+
endIndex = index
|
129
|
+
break;
|
130
|
+
end
|
131
|
+
end
|
132
|
+
index = index + 1
|
133
|
+
end
|
134
|
+
|
135
|
+
if endIndex != i
|
136
|
+
for j in i+1 ... (endIndex + 1)
|
137
|
+
local_output += @document.docArray[j]
|
138
|
+
local_output += " "
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
local_output
|
143
|
+
end
|
144
|
+
|
145
|
+
# :documet is a historical misspelling kept so existing callers keep working;
# :document is the accessor that actually exposes @document.
attr_accessor :pheremones, :document, :documet, :keywords, :sigma, :threshold, :output
|
146
|
+
end
|
147
|
+
|
148
|
+
|
149
|
+
# Counts how often each non-stop word occurs in a document and reports the
# words whose count is well above the average count.
class FrequencyAnalyzer
  attr_accessor :freqCount, :keywords

  # Builds the frequency table and the mean count.
  #
  # document     - object whose #docArray lists the document's tokens.
  # stopWordsDoc - object whose #docArray lists the words to ignore. Tokens are
  #                compared against it after punctuation removal and
  #                downcasing, but are counted in their original case.
  def initialize(document,stopWordsDoc)
    @freqCount = {}
    @document = document
    @stopWords = stopWordsDoc
    @mean = 0
    @keywords = []

    # Hash lookup instead of scanning the stop-word array once per token.
    stop_lookup = {}
    @stopWords.docArray.each { |stop_word| stop_lookup[stop_word] = true }

    @document.docArray.each do |raw|
      next if raw.nil?

      word = CGWordOps.removePunctuation(raw)
      next if stop_lookup.key?(word.downcase)

      # Stemming is intentionally not applied to the counted word.
      @freqCount[word] = @freqCount.fetch(word, 0) + 1
    end

    # Integer (truncating) mean of the per-word counts. When no word was
    # counted the mean stays 0 instead of raising ZeroDivisionError.
    total = 0
    @freqCount.each_value { |occurrences| total += occurrences }
    @mean = total / @freqCount.size unless @freqCount.empty?
  end

  # Selects the words that occur more than k times the mean count.
  #
  # k - multiplier applied to the mean count (default 3).
  #
  # The empty string (a token that consisted only of punctuation) is never
  # reported. Stores the result in @keywords and returns it.
  def analyze(k=3)
    cutoff = k * @mean
    @keywords = @freqCount.keys.select do |word|
      word != "" && @freqCount[word] > cutoff
    end
    @keywords
  end
end
|
204
|
+
|
205
|
+
|
206
|
+
# Word-level text helpers.
class CGWordOps
  # Characters stripped by removePunctuation. The hyphen is placed last so
  # that String#delete reads it as a literal character, not a range operator.
  PUNCTUATION = '.,;:()[]?!"-'.freeze

  # Returns a copy of +word+ with every occurrence of . , ; : ( ) [ ] ? ! "
  # and - removed. Note that all hyphens are dropped, including those inside
  # hyphenated words. The argument itself is not modified.
  def self.removePunctuation(word)
    word.delete(PUNCTUATION)
  end
end
|
147
224
|
|
148
225
|
|
226
|
+
# Reads a text file and keeps its whitespace-separated tokens in memory.
#
# docArray  - the tokens exactly as they appear in the file.
# wordArray - one entry per token: the token passed through String#stem,
#             CGWordOps.removePunctuation and String#stem again.
# docString - every token followed by a single space, concatenated.
# docFile   - the File object used for reading; it is closed once loading
#             has finished.
class CGDocument
  #Create getters/setters.
  attr_accessor :docArray, :wordArray, :docName, :docFile, :docString

  # docName - path of the file to load.
  #
  # Raises whatever File.new raises when the file cannot be opened.
  def initialize(docName)
    @docName = docName
    @docArray = []
    @wordArray = []
    @docString = ""
    @docFile = File.new(@docName)
    begin
      @docFile.each_line do |line|
        line.split(" ").each do |token|
          @docString << token << " "
          @docArray << token
          @wordArray << CGWordOps.removePunctuation(token.stem).stem
        end
      end
    ensure
      # Release the descriptor even if tokenising raises part-way through.
      @docFile.close
    end
  end
end
|
251
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: summa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- HyLiter.org
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-09-
|
12
|
+
date: 2009-09-22 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|