highscore 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +4 -2
- data/History.txt +6 -0
- data/README.md +13 -3
- data/Rakefile +1 -1
- data/bin/highscore +11 -4
- data/lib/highscore/content.rb +46 -3
- data/lib/highscore/keywords.rb +3 -3
- data/test/highscore/test_content.rb +35 -0
- data/version.txt +1 -1
- metadata +6 -6
data/.travis.yml
CHANGED
@@ -6,5 +6,7 @@ rvm:
|
|
6
6
|
- jruby-18mode # JRuby in 1.8 mode
|
7
7
|
- jruby-19mode # JRuby in 1.9 mode
|
8
8
|
- rbx-18mode
|
9
|
-
|
10
|
-
|
9
|
+
- rbx-19mode
|
10
|
+
|
11
|
+
# fast-stemmer is a C extension, so it won't work with JRuby ...
|
12
|
+
before_install: gem install bones; (gem install fast-stemmer &); sleep 10
|
data/History.txt
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
== 0.5.2 / 2012-02-25
|
2
|
+
|
3
|
+
* added (optional) support for word stemming using the fast-stemmer gem
|
4
|
+
* setting to enable/disable case-sensitivity (thanks to carlosramireziii)
|
5
|
+
* use other regex patterns to cut the text in keywords (thanks to carlosramireziii)
|
6
|
+
|
1
7
|
== 0.5.0 / 2012-02-20
|
2
8
|
|
3
9
|
* now supports a whitelist approach
|
data/README.md
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
highscore
|
2
2
|
===========
|
3
3
|
|
4
|
-
|
4
|
+
Easily find and rank keywords in long texts.
|
5
|
+
|
6
|
+
[Build Status](http://travis-ci.org/domnikl/highscore)
|
5
7
|
|
6
8
|
Features
|
7
9
|
--------
|
@@ -11,6 +13,7 @@ Features
|
|
11
13
|
* directly get keywords from String objects
|
12
14
|
* blacklist words via a plain text file, String or an Array of words
|
13
15
|
* optionally, configure a whitelist and only words from that list will get ranked
|
16
|
+
* use word stemming (requires the fast-stemmer gem, doesn't work on JRuby platforms!)
|
14
17
|
* merge together Keywords from multiple sources
|
15
18
|
* contains a CLI tool that operates on STDIN/OUT and is configurable via parameters
|
16
19
|
|
@@ -24,8 +27,11 @@ text.configure do
|
|
24
27
|
set :upper_case, 3
|
25
28
|
set :long_words, 2
|
26
29
|
set :long_words_threshold, 15
|
27
|
-
set :vowels, 1
|
28
|
-
set :consonants, 5
|
30
|
+
set :vowels, 1 # => default: 0 = not considered
|
31
|
+
set :consonants, 5 # => default: 0 = not considered
|
32
|
+
set :ignore_case, true # => default: false
|
33
|
+
set :word_pattern, /[\w]+[^\s0-9]/ # => default: /\w+/
|
34
|
+
set :stemming, true # => default: false
|
29
35
|
end
|
30
36
|
|
31
37
|
# get only the top 50 keywords
|
@@ -94,6 +100,10 @@ Install
|
|
94
100
|
|
95
101
|
* `[sudo] gem install highscore`
|
96
102
|
|
103
|
+
To use word stemming, you need to have the fast-stemmer gem installed:
|
104
|
+
|
105
|
+
* `[sudo] gem install fast-stemmer`
|
106
|
+
|
97
107
|
Author
|
98
108
|
------
|
99
109
|
|
data/Rakefile
CHANGED
data/bin/highscore
CHANGED
@@ -31,14 +31,16 @@ optparse = OptionParser.new do |opts|
|
|
31
31
|
options[:emphasis][:multiplier] = multiplier.to_f
|
32
32
|
end
|
33
33
|
|
34
|
-
# don't print rank weight
|
35
34
|
opts.on('--no-ignore-short', 'don\'t ignore short words (<= 2 chars)') do
|
36
35
|
options[:emphasis][:ignore_short_words] = false
|
37
36
|
end
|
38
37
|
|
39
|
-
|
40
|
-
|
41
|
-
|
38
|
+
opts.on('--ignore-case', 'case insensitive') do
|
39
|
+
options[:emphasis][:ignore_case] = true
|
40
|
+
end
|
41
|
+
|
42
|
+
opts.on('--stemming', 'use word stemming (won\'t work on JRuby platforms)') do
|
43
|
+
options[:emphasis][:stemming] = true
|
42
44
|
end
|
43
45
|
|
44
46
|
# upper case
|
@@ -69,6 +71,11 @@ optparse = OptionParser.new do |opts|
|
|
69
71
|
opts.on('-n', '--top N', 'show only the top N keywords') do |u|
|
70
72
|
options[:top] = u.to_i
|
71
73
|
end
|
74
|
+
|
75
|
+
# don't print rank weight
|
76
|
+
opts.on('-s', '--short', 'don\'t print rank weight') do
|
77
|
+
options[:short] = true
|
78
|
+
end
|
72
79
|
end
|
73
80
|
|
74
81
|
optparse.parse!
|
data/lib/highscore/content.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
|
-
$:.unshift(File.
|
1
|
+
$:.unshift(File.dirname(__FILE__))
|
2
2
|
require 'keywords'
|
3
3
|
|
4
|
+
# external gems
|
5
|
+
require 'rubygems'
|
6
|
+
|
4
7
|
module Highscore
|
5
8
|
class Content
|
6
9
|
attr_reader :content
|
@@ -26,7 +29,10 @@ module Highscore
|
|
26
29
|
:long_words_threshold => 15,
|
27
30
|
:vowels => 0,
|
28
31
|
:consonants => 0,
|
29
|
-
:ignore_short_words => true
|
32
|
+
:ignore_short_words => true,
|
33
|
+
:ignore_case => false,
|
34
|
+
:word_pattern => /\w+/,
|
35
|
+
:stemming => false
|
30
36
|
}
|
31
37
|
end
|
32
38
|
|
@@ -49,10 +55,13 @@ module Highscore
|
|
49
55
|
#
|
50
56
|
# @return Highscore::Keywords
|
51
57
|
def keywords
|
58
|
+
@emphasis[:stemming] = use_stemming?
|
59
|
+
|
52
60
|
keywords = Keywords.new
|
53
61
|
|
54
|
-
Keywords.find_keywords(
|
62
|
+
Keywords.find_keywords(processed_content, wordlist, word_pattern).each do |text|
|
55
63
|
text = text.to_s
|
64
|
+
text = text.stem if @emphasis[:stemming]
|
56
65
|
|
57
66
|
if not (text.match(/^[\d]+(\.[\d]+){0,1}$/) or text.length <= 2)
|
58
67
|
keywords << Highscore::Keyword.new(text, weight(text))
|
@@ -77,6 +86,16 @@ module Highscore
|
|
77
86
|
|
78
87
|
private
|
79
88
|
|
89
|
+
# processes the text content applying any necessary transformations
|
90
|
+
#
|
91
|
+
# @return String
|
92
|
+
def processed_content
|
93
|
+
"".tap do |result|
|
94
|
+
result.replace(@content) # initialize the result to be @content
|
95
|
+
result.replace(result.downcase) if @emphasis[:ignore_case]
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
80
99
|
# allow short words to be rated
|
81
100
|
#
|
82
101
|
# @return TrueClass FalseClass
|
@@ -84,6 +103,13 @@ module Highscore
|
|
84
103
|
not @emphasis[:ignore_short_words]
|
85
104
|
end
|
86
105
|
|
106
|
+
# regex used to split text
|
107
|
+
#
|
108
|
+
# @return Regex
|
109
|
+
def word_pattern
|
110
|
+
@emphasis[:word_pattern]
|
111
|
+
end
|
112
|
+
|
87
113
|
# weight a single text keyword
|
88
114
|
#
|
89
115
|
# @param text String
|
@@ -121,5 +147,22 @@ module Highscore
|
|
121
147
|
percent = text.consonants.length / text.length.to_f
|
122
148
|
percent * @emphasis[:consonants]
|
123
149
|
end
|
150
|
+
|
151
|
+
private
|
152
|
+
|
153
|
+
# using stemming is only possible, if fast-stemmer is installed
|
154
|
+
# doesn't work for JRuby
|
155
|
+
def use_stemming?
|
156
|
+
if @emphasis[:stemming]
|
157
|
+
begin
|
158
|
+
require 'fast_stemmer'
|
159
|
+
true
|
160
|
+
rescue LoadError
|
161
|
+
false
|
162
|
+
end
|
163
|
+
else
|
164
|
+
false
|
165
|
+
end
|
166
|
+
end
|
124
167
|
end
|
125
168
|
end
|
data/lib/highscore/keywords.rb
CHANGED
@@ -12,10 +12,10 @@ module Highscore
|
|
12
12
|
#
|
13
13
|
# @param content String
|
14
14
|
# @param wordlist Highscore::Wordlist
|
15
|
+
# @param pattern Regex
|
15
16
|
# @return Highscore::Keywords
|
16
|
-
def self.find_keywords content, wordlist
|
17
|
-
keywords = content.to_s.scan(
|
18
|
-
|
17
|
+
def self.find_keywords content, wordlist, pattern=/\w+/
|
18
|
+
keywords = content.to_s.scan(pattern)
|
19
19
|
keywords.delete_if do |key, value|
|
20
20
|
if wordlist.kind_of? Highscore::Blacklist
|
21
21
|
wordlist.include?(key.downcase)
|
data/test/highscore/test_content.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
$:.unshift(File.join(File.dirname(__FILE__), %w{.. .. lib highscore}))
|
2
2
|
require "content"
|
3
3
|
require "test/unit"
|
4
|
+
require 'rubygems'
|
4
5
|
|
5
6
|
class TestContent < Test::Unit::TestCase
|
6
7
|
def setup
|
@@ -50,4 +51,38 @@ class TestContent < Test::Unit::TestCase
|
|
50
51
|
|
51
52
|
assert_equal 4, keywords.length
|
52
53
|
end
|
54
|
+
|
55
|
+
def test_word_pattern
|
56
|
+
keywords = 'foo Ruby foo Ruby'.keywords do
|
57
|
+
set :word_pattern, /(?=(\b\w+\s\w+\b))/
|
58
|
+
end
|
59
|
+
|
60
|
+
assert_equal 2, keywords.length
|
61
|
+
end
|
62
|
+
|
63
|
+
def test_ignore_case
|
64
|
+
keywords = 'foo Foo bar Bar'.keywords do
|
65
|
+
set :ignore_case, true
|
66
|
+
end
|
67
|
+
|
68
|
+
assert_equal 2, keywords.length
|
69
|
+
end
|
70
|
+
|
71
|
+
def test_stemming
|
72
|
+
begin
|
73
|
+
require 'fast_stemmer'
|
74
|
+
|
75
|
+
keywords = 'word words boards board woerter wort'.keywords do
|
76
|
+
set :stemming, true
|
77
|
+
end
|
78
|
+
|
79
|
+
assert_equal 4, keywords.length
|
80
|
+
|
81
|
+
keywords.each do |k|
|
82
|
+
assert (%w{board word woerter wort}).include?(k.text)
|
83
|
+
end
|
84
|
+
rescue LoadError
|
85
|
+
# do nothing, just skip this test
|
86
|
+
end
|
87
|
+
end
|
53
88
|
end
|
data/version.txt
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.5.0
|
1
|
+
0.5.2
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: highscore
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.0
|
4
|
+
version: 0.5.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-02-
|
12
|
+
date: 2012-02-25 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bones
|
16
|
-
requirement: &
|
16
|
+
requirement: &70144626889800 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,8 +21,8 @@ dependencies:
|
|
21
21
|
version: 3.7.3
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
25
|
-
description:
|
24
|
+
version_requirements: *70144626889800
|
25
|
+
description: Easily find and rank keywords in long texts.
|
26
26
|
email: liebler.dominik@googlemail.com
|
27
27
|
executables:
|
28
28
|
- highscore
|
@@ -83,7 +83,7 @@ rubyforge_project: highscore
|
|
83
83
|
rubygems_version: 1.8.16
|
84
84
|
signing_key:
|
85
85
|
specification_version: 3
|
86
|
-
summary:
|
86
|
+
summary: Easily find and rank keywords in long texts.
|
87
87
|
test_files:
|
88
88
|
- test/highscore/test_blacklist.rb
|
89
89
|
- test/highscore/test_content.rb
|