highscore 0.2.1 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +4 -0
- data/README.md +9 -2
- data/highscore.gemspec +34 -0
- data/lib/highscore/content.rb +5 -18
- data/lib/highscore/keyword.rb +19 -0
- data/lib/highscore/keywords.rb +47 -28
- data/test/highscore/test_content.rb +7 -0
- data/test/highscore/test_keyword.rb +31 -0
- data/test/highscore/test_keywords.rb +19 -17
- data/version.txt +1 -1
- metadata +8 -4
data/History.txt
CHANGED
data/README.md
CHANGED
@@ -19,8 +19,15 @@ Examples
|
|
19
19
|
set :long_words_threshold, 15
|
20
20
|
end
|
21
21
|
|
22
|
-
|
23
|
-
text.keywords.
|
22
|
+
# get all keywords
|
23
|
+
text.keywords.rank => Array
|
24
|
+
|
25
|
+
# get the top 50 keywords
|
26
|
+
text.keywords.top(50).each do |keyword|
|
27
|
+
keyword.text # => keyword text
|
28
|
+
keyword.weight # => rank weight (float)
|
29
|
+
end
|
30
|
+
|
24
31
|
|
25
32
|
Requirements
|
26
33
|
------------
|
data/highscore.gemspec
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
Gem::Specification.new do |s|
|
4
|
+
s.name = "highscore"
|
5
|
+
s.version = "0.3.0"
|
6
|
+
|
7
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
|
+
s.authors = ["Dominik Liebler"]
|
9
|
+
s.date = "2012-01-21"
|
10
|
+
s.description = "Rank keywords in long texts."
|
11
|
+
s.email = "liebler.dominik@googlemail.com"
|
12
|
+
s.executables = ["highscore"]
|
13
|
+
s.extra_rdoc_files = ["History.txt", "bin/highscore"]
|
14
|
+
s.files = [".gitignore", "History.txt", "README.md", "Rakefile", "bin/highscore", "lib/highscore.rb", "lib/highscore/content.rb", "lib/highscore/keyword.rb", "lib/highscore/keywords.rb", "test/highscore/test_content.rb", "test/highscore/test_keyword.rb", "test/highscore/test_keywords.rb", "test/test_highscore.rb", "version.txt"]
|
15
|
+
s.homepage = "http://thewebdev.de"
|
16
|
+
s.rdoc_options = ["--main", "README.md"]
|
17
|
+
s.require_paths = ["lib"]
|
18
|
+
s.rubyforge_project = "highscore"
|
19
|
+
s.rubygems_version = "1.8.11"
|
20
|
+
s.summary = "Rank keywords in long texts."
|
21
|
+
s.test_files = ["test/highscore/test_content.rb", "test/highscore/test_keyword.rb", "test/highscore/test_keywords.rb", "test/test_highscore.rb"]
|
22
|
+
|
23
|
+
if s.respond_to? :specification_version then
|
24
|
+
s.specification_version = 3
|
25
|
+
|
26
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
27
|
+
s.add_development_dependency(%q<bones>, [">= 3.7.3"])
|
28
|
+
else
|
29
|
+
s.add_dependency(%q<bones>, [">= 3.7.3"])
|
30
|
+
end
|
31
|
+
else
|
32
|
+
s.add_dependency(%q<bones>, [">= 3.7.3"])
|
33
|
+
end
|
34
|
+
end
|
data/lib/highscore/content.rb
CHANGED
@@ -34,36 +34,23 @@ module Highscore
|
|
34
34
|
# keywords -> Keywords
|
35
35
|
#
|
36
36
|
def keywords
|
37
|
-
keywords = Keywords.new
|
37
|
+
keywords = Keywords.new
|
38
38
|
|
39
|
-
find_keywords.each do |
|
39
|
+
Keywords.find_keywords(@content).each do |text|
|
40
40
|
weight = @emphasis[:multiplier]
|
41
41
|
|
42
|
-
if
|
42
|
+
if text.length >= @emphasis[:long_words_threshold]
|
43
43
|
weight *= @emphasis[:long_words]
|
44
44
|
end
|
45
45
|
|
46
|
-
if
|
46
|
+
if text[0] == text[0].upcase
|
47
47
|
weight *= @emphasis[:upper_case]
|
48
48
|
end
|
49
49
|
|
50
|
-
keywords
|
50
|
+
keywords << Highscore::Keyword.new(text, weight)
|
51
51
|
end
|
52
52
|
|
53
53
|
keywords
|
54
54
|
end
|
55
|
-
|
56
|
-
private
|
57
|
-
|
58
|
-
# find keywords in the content and rate them
|
59
|
-
#
|
60
|
-
def find_keywords
|
61
|
-
keywords = @content.scan(/\w+/)
|
62
|
-
keywords.delete_if do |x|
|
63
|
-
x.match(/^[\d]+(\.[\d]+){0,1}$/) or x.length <= 2
|
64
|
-
end
|
65
|
-
|
66
|
-
keywords.sort
|
67
|
-
end
|
68
55
|
end
|
69
56
|
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Highscore
|
2
|
+
|
3
|
+
# keywords read from the content
|
4
|
+
#
|
5
|
+
class Keyword
|
6
|
+
attr_accessor :weight, :text
|
7
|
+
|
8
|
+
# init a keyword
|
9
|
+
def initialize(text, weight)
|
10
|
+
@text = text
|
11
|
+
@weight = weight.to_f
|
12
|
+
end
|
13
|
+
|
14
|
+
# sort keywords
|
15
|
+
def <=>(other)
|
16
|
+
other.weight <=> @weight
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
data/lib/highscore/keywords.rb
CHANGED
@@ -1,7 +1,32 @@
|
|
1
|
+
# external
|
2
|
+
require 'digest/sha1'
|
3
|
+
|
1
4
|
module Highscore
|
5
|
+
|
2
6
|
# keywords that were found in content
|
3
7
|
#
|
4
|
-
class Keywords
|
8
|
+
class Keywords
|
9
|
+
include Enumerable
|
10
|
+
|
11
|
+
# find keywords in a piece of content
|
12
|
+
def self.find_keywords content
|
13
|
+
keywords = content.scan(/\w+/)
|
14
|
+
keywords.delete_if do |x|
|
15
|
+
x.match(/^[\d]+(\.[\d]+){0,1}$/) or x.length <= 2
|
16
|
+
end
|
17
|
+
|
18
|
+
keywords.delete_if do |key, value|
|
19
|
+
%w{the and that post add not see about using some something under our comments comment run you want for will file are with end new this use all but can your just get very data blog format out first they posts second}.include? key.downcase
|
20
|
+
end
|
21
|
+
|
22
|
+
keywords.sort
|
23
|
+
end
|
24
|
+
|
25
|
+
# init a new keyword collection
|
26
|
+
#
|
27
|
+
def initialize
|
28
|
+
@keywords = {}
|
29
|
+
end
|
5
30
|
|
6
31
|
# ranks the keywords and removes keywords that have a low ranking
|
7
32
|
# or are blacklisted
|
@@ -10,50 +35,44 @@ module Highscore
|
|
10
35
|
# rank -> array
|
11
36
|
#
|
12
37
|
def rank
|
13
|
-
|
14
|
-
sort_it
|
38
|
+
sort
|
15
39
|
end
|
16
40
|
|
17
41
|
# get the top n keywords
|
18
42
|
#
|
19
43
|
def top n = 10
|
20
|
-
filter
|
21
44
|
rank[0..(n - 1)]
|
22
45
|
end
|
23
46
|
|
24
|
-
#
|
25
|
-
#
|
26
|
-
# :call-seq:
|
27
|
-
# sort_it -> array
|
47
|
+
# add new keywords
|
28
48
|
#
|
29
|
-
def
|
30
|
-
|
49
|
+
def <<(keyword)
|
50
|
+
key = Digest::SHA1.hexdigest(keyword.text)
|
51
|
+
|
52
|
+
if @keywords.has_key?(key)
|
53
|
+
@keywords[key].weight += keyword.weight
|
54
|
+
else
|
55
|
+
@keywords[key] = keyword
|
56
|
+
end
|
31
57
|
end
|
32
58
|
|
33
|
-
|
59
|
+
# sort
|
60
|
+
def sort
|
61
|
+
sorted = @keywords.sort {|a,b| a[1] <=> b[1] }
|
34
62
|
|
35
|
-
|
36
|
-
|
37
|
-
def filter
|
38
|
-
run_blacklist
|
39
|
-
filter_low
|
63
|
+
# convert Array from sort back to Array of Keyword objects
|
64
|
+
sorted.collect {|x| x[1]}
|
40
65
|
end
|
41
66
|
|
42
|
-
#
|
67
|
+
# Enumerable
|
43
68
|
#
|
44
|
-
def
|
45
|
-
|
46
|
-
value <= 0
|
47
|
-
end
|
69
|
+
def each &block
|
70
|
+
@keywords.each {|keyword| keyword.call(keyword)}
|
48
71
|
end
|
49
72
|
|
50
|
-
#
|
51
|
-
|
52
|
-
|
53
|
-
# FIXME: add more keywords!
|
54
|
-
delete_if do |key, value|
|
55
|
-
%w{the and that post add not see about using some something under our comments comment run you want for will file are with end new this use all but can your just get very data blog format out first they posts second}.include? key.downcase
|
56
|
-
end
|
73
|
+
# number of Keywords given
|
74
|
+
def length
|
75
|
+
@keywords.length
|
57
76
|
end
|
58
77
|
end
|
59
78
|
end
|
@@ -15,4 +15,11 @@ class TestContent < Test::Unit::TestCase
|
|
15
15
|
def test_keywords
|
16
16
|
assert_instance_of(Highscore::Keywords, @content.keywords)
|
17
17
|
end
|
18
|
+
|
19
|
+
def test_multiple_keywords
|
20
|
+
content = 'Ruby Ruby Ruby and so forth ...'
|
21
|
+
|
22
|
+
content = Highscore::Content.new content
|
23
|
+
assert_equal 2, content.keywords.length
|
24
|
+
end
|
18
25
|
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
$:.unshift(File.join(File.dirname(__FILE__), %w{.. .. lib highscore}))
|
2
|
+
require 'keyword'
|
3
|
+
require "test/unit"
|
4
|
+
|
5
|
+
class TestKeyword < Test::Unit::TestCase
|
6
|
+
def setup
|
7
|
+
@keyword = Highscore::Keyword.new('Ruby', 2)
|
8
|
+
end
|
9
|
+
|
10
|
+
def test_init
|
11
|
+
|
12
|
+
# don't allow 'empty' keywords
|
13
|
+
assert_raise(ArgumentError) do
|
14
|
+
Highscore::Keyword.new
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_text
|
19
|
+
assert_equal 'Ruby', @keyword.text
|
20
|
+
|
21
|
+
@keyword.text = 'Foobar'
|
22
|
+
assert_equal 'Foobar', @keyword.text
|
23
|
+
end
|
24
|
+
|
25
|
+
def test_weight
|
26
|
+
assert_equal 2, @keyword.weight
|
27
|
+
|
28
|
+
@keyword.weight = 10.123
|
29
|
+
assert_equal 10.123, @keyword.weight
|
30
|
+
end
|
31
|
+
end
|
@@ -1,14 +1,15 @@
|
|
1
1
|
$:.unshift(File.join(File.dirname(__FILE__), %w{.. .. lib highscore}))
|
2
|
-
require
|
2
|
+
require 'keywords'
|
3
|
+
require 'keyword'
|
3
4
|
require "test/unit"
|
4
5
|
|
5
6
|
class TestKeywords < Test::Unit::TestCase
|
6
7
|
def setup
|
7
8
|
@keywords = Highscore::Keywords.new
|
8
|
-
@keywords
|
9
|
-
@keywords
|
10
|
-
@keywords
|
11
|
-
@keywords
|
9
|
+
@keywords << Highscore::Keyword.new('Ruby', 2)
|
10
|
+
@keywords << Highscore::Keyword.new('Sinatra', 3)
|
11
|
+
@keywords << Highscore::Keyword.new('Highscore', 1)
|
12
|
+
@keywords << Highscore::Keyword.new('the', 10)
|
12
13
|
end
|
13
14
|
|
14
15
|
def test_init
|
@@ -20,10 +21,16 @@ class TestKeywords < Test::Unit::TestCase
|
|
20
21
|
|
21
22
|
ranked = @keywords.rank
|
22
23
|
|
23
|
-
|
24
|
+
ranked_texts = []
|
25
|
+
ranked.each do |keyword|
|
26
|
+
assert(keyword.instance_of?(Highscore::Keyword),
|
27
|
+
"keywords must be instances of Highscore::Keyword, #{keyword.class} given")
|
28
|
+
ranked_texts << keyword.text
|
29
|
+
end
|
24
30
|
|
25
|
-
should_rank =
|
26
|
-
|
31
|
+
should_rank = %w{the Sinatra Ruby Highscore}
|
32
|
+
|
33
|
+
assert_equal should_rank, ranked_texts
|
27
34
|
end
|
28
35
|
|
29
36
|
def test_rank_empty
|
@@ -31,18 +38,13 @@ class TestKeywords < Test::Unit::TestCase
|
|
31
38
|
end
|
32
39
|
|
33
40
|
def test_top
|
34
|
-
|
41
|
+
top = @keywords.top(1)
|
42
|
+
|
43
|
+
assert_equal('the', top[0].text)
|
44
|
+
assert_equal(10.0, top[0].weight)
|
35
45
|
end
|
36
46
|
|
37
47
|
def test_top_empty
|
38
48
|
assert_equal [], Highscore::Keywords.new.top(0)
|
39
49
|
end
|
40
|
-
|
41
|
-
def test_sort
|
42
|
-
keywords = Highscore::Keywords.new
|
43
|
-
keywords['Test'] = 1
|
44
|
-
keywords['Foobar'] = 2
|
45
|
-
|
46
|
-
assert_equal [['Foobar', 2], ['Test', 1]], keywords.sort_it
|
47
|
-
end
|
48
50
|
end
|
data/version.txt
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.1
|
1
|
+
0.3.1
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: highscore
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-01-
|
12
|
+
date: 2012-01-21 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bones
|
16
|
-
requirement: &
|
16
|
+
requirement: &70191947101320 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: 3.7.3
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70191947101320
|
25
25
|
description: Rank keywords in long texts.
|
26
26
|
email: liebler.dominik@googlemail.com
|
27
27
|
executables:
|
@@ -36,10 +36,13 @@ files:
|
|
36
36
|
- README.md
|
37
37
|
- Rakefile
|
38
38
|
- bin/highscore
|
39
|
+
- highscore.gemspec
|
39
40
|
- lib/highscore.rb
|
40
41
|
- lib/highscore/content.rb
|
42
|
+
- lib/highscore/keyword.rb
|
41
43
|
- lib/highscore/keywords.rb
|
42
44
|
- test/highscore/test_content.rb
|
45
|
+
- test/highscore/test_keyword.rb
|
43
46
|
- test/highscore/test_keywords.rb
|
44
47
|
- test/test_highscore.rb
|
45
48
|
- version.txt
|
@@ -71,5 +74,6 @@ specification_version: 3
|
|
71
74
|
summary: Rank keywords in long texts.
|
72
75
|
test_files:
|
73
76
|
- test/highscore/test_content.rb
|
77
|
+
- test/highscore/test_keyword.rb
|
74
78
|
- test/highscore/test_keywords.rb
|
75
79
|
- test/test_highscore.rb
|