highscore 0.5.0 → 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +4 -2
- data/History.txt +6 -0
- data/README.md +13 -3
- data/Rakefile +1 -1
- data/bin/highscore +11 -4
- data/lib/highscore/content.rb +46 -3
- data/lib/highscore/keywords.rb +3 -3
- data/test/highscore/test_content.rb +35 -0
- data/version.txt +1 -1
- metadata +6 -6
data/.travis.yml
CHANGED
@@ -6,5 +6,7 @@ rvm:
|
|
6
6
|
- jruby-18mode # JRuby in 1.8 mode
|
7
7
|
- jruby-19mode # JRuby in 1.9 mode
|
8
8
|
- rbx-18mode
|
9
|
-
|
10
|
-
|
9
|
+
- rbx-19mode
|
10
|
+
|
11
|
+
# fast-stemmer is a C extension, so it won't work with JRuby ...
|
12
|
+
before_install: gem install bones; (gem install fast-stemmer &); sleep 10
|
data/History.txt
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
== 0.5.2 / 2012-02-25
|
2
|
+
|
3
|
+
* added (optional) support for word stemming using the fast-stemmer gem
|
4
|
+
* setting to enable/disable case-sensitivity (thanks to carlosramireziii)
|
5
|
+
* allow custom regex patterns for splitting the text into keywords (thanks to carlosramireziii)
|
6
|
+
|
1
7
|
== 0.5.0 / 2012-02-20
|
2
8
|
|
3
9
|
* now supports a whitelist approach
|
data/README.md
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
highscore
|
2
2
|
===========
|
3
3
|
|
4
|
-
|
4
|
+
Easily find and rank keywords in long texts.
|
5
|
+
|
6
|
+
[![Build Status](https://secure.travis-ci.org/domnikl/highscore.png?branch=develop)](http://travis-ci.org/domnikl/highscore)
|
5
7
|
|
6
8
|
Features
|
7
9
|
--------
|
@@ -11,6 +13,7 @@ Features
|
|
11
13
|
* directly get keywords from String objects
|
12
14
|
* blacklist words via a plain text file, String or an Array of words
|
13
15
|
* optionally, configure a whitelist and only words from that list will get ranked
|
16
|
+
* use word stemming (requires the fast-stemmer gem, doesn't work on JRuby platforms!)
|
14
17
|
* merge together Keywords from multiple sources
|
15
18
|
* contains a CLI tool that operates on STDIN/OUT and is configurable via parameters
|
16
19
|
|
@@ -24,8 +27,11 @@ text.configure do
|
|
24
27
|
set :upper_case, 3
|
25
28
|
set :long_words, 2
|
26
29
|
set :long_words_threshold, 15
|
27
|
-
set :vowels, 1
|
28
|
-
set :consonants, 5
|
30
|
+
set :vowels, 1 # => default: 0 = not considered
|
31
|
+
set :consonants, 5 # => default: 0 = not considered
|
32
|
+
set :ignore_case, true # => default: false
|
33
|
+
set :word_pattern, /[\w]+[^\s0-9]/ # => default: /\w+/
|
34
|
+
set :stemming, true # => default: false
|
29
35
|
end
|
30
36
|
|
31
37
|
# get only the top 50 keywords
|
@@ -94,6 +100,10 @@ Install
|
|
94
100
|
|
95
101
|
* `[sudo] gem install highscore`
|
96
102
|
|
103
|
+
To use word stemming, you need to have the fast-stemmer gem installed:
|
104
|
+
|
105
|
+
* `[sudo] gem install fast-stemmer`
|
106
|
+
|
97
107
|
Author
|
98
108
|
------
|
99
109
|
|
data/Rakefile
CHANGED
data/bin/highscore
CHANGED
@@ -31,14 +31,16 @@ optparse = OptionParser.new do |opts|
|
|
31
31
|
options[:emphasis][:multiplier] = multiplier.to_f
|
32
32
|
end
|
33
33
|
|
34
|
-
# don't print rank weight
|
35
34
|
opts.on('--no-ignore-short', 'don\'t ignore short words (<= 2 chars)') do
|
36
35
|
options[:emphasis][:ignore_short_words] = false
|
37
36
|
end
|
38
37
|
|
39
|
-
|
40
|
-
|
41
|
-
|
38
|
+
opts.on('--ignore-case', 'case insensitive') do
|
39
|
+
options[:emphasis][:ignore_case] = true
|
40
|
+
end
|
41
|
+
|
42
|
+
opts.on('--stemming', 'use word stemming (won\'t work on JRuby platforms)') do
|
43
|
+
options[:emphasis][:stemming] = true
|
42
44
|
end
|
43
45
|
|
44
46
|
# upper case
|
@@ -69,6 +71,11 @@ optparse = OptionParser.new do |opts|
|
|
69
71
|
opts.on('-n', '--top N', 'show only the top N keywords') do |u|
|
70
72
|
options[:top] = u.to_i
|
71
73
|
end
|
74
|
+
|
75
|
+
# don't print rank weight
|
76
|
+
opts.on('-s', '--short', 'don\'t print rank weight') do
|
77
|
+
options[:short] = true
|
78
|
+
end
|
72
79
|
end
|
73
80
|
|
74
81
|
optparse.parse!
|
data/lib/highscore/content.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
|
-
$:.unshift(File.
|
1
|
+
$:.unshift(File.dirname(__FILE__))
|
2
2
|
require 'keywords'
|
3
3
|
|
4
|
+
# external gems
|
5
|
+
require 'rubygems'
|
6
|
+
|
4
7
|
module Highscore
|
5
8
|
class Content
|
6
9
|
attr_reader :content
|
@@ -26,7 +29,10 @@ module Highscore
|
|
26
29
|
:long_words_threshold => 15,
|
27
30
|
:vowels => 0,
|
28
31
|
:consonants => 0,
|
29
|
-
:ignore_short_words => true
|
32
|
+
:ignore_short_words => true,
|
33
|
+
:ignore_case => false,
|
34
|
+
:word_pattern => /\w+/,
|
35
|
+
:stemming => false
|
30
36
|
}
|
31
37
|
end
|
32
38
|
|
@@ -49,10 +55,13 @@ module Highscore
|
|
49
55
|
#
|
50
56
|
# @return Highscore::Keywords
|
51
57
|
def keywords
|
58
|
+
@emphasis[:stemming] = use_stemming?
|
59
|
+
|
52
60
|
keywords = Keywords.new
|
53
61
|
|
54
|
-
Keywords.find_keywords(
|
62
|
+
Keywords.find_keywords(processed_content, wordlist, word_pattern).each do |text|
|
55
63
|
text = text.to_s
|
64
|
+
text = text.stem if @emphasis[:stemming]
|
56
65
|
|
57
66
|
if not (text.match(/^[\d]+(\.[\d]+){0,1}$/) or text.length <= 2)
|
58
67
|
keywords << Highscore::Keyword.new(text, weight(text))
|
@@ -77,6 +86,16 @@ module Highscore
|
|
77
86
|
|
78
87
|
private
|
79
88
|
|
89
|
+
# processes the text content applying any necessary transformations
|
90
|
+
#
|
91
|
+
# @return String
|
92
|
+
def processed_content
|
93
|
+
"".tap do |result|
|
94
|
+
result.replace(@content) # initialize the result to be @content
|
95
|
+
result.replace(result.downcase) if @emphasis[:ignore_case]
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
80
99
|
# allow short words to be rated
|
81
100
|
#
|
82
101
|
# @return TrueClass FalseClass
|
@@ -84,6 +103,13 @@ module Highscore
|
|
84
103
|
not @emphasis[:ignore_short_words]
|
85
104
|
end
|
86
105
|
|
106
|
+
# regex used to split text
|
107
|
+
#
|
108
|
+
# @return Regexp
|
109
|
+
def word_pattern
|
110
|
+
@emphasis[:word_pattern]
|
111
|
+
end
|
112
|
+
|
87
113
|
# weight a single text keyword
|
88
114
|
#
|
89
115
|
# @param text String
|
@@ -121,5 +147,22 @@ module Highscore
|
|
121
147
|
percent = text.consonants.length / text.length.to_f
|
122
148
|
percent * @emphasis[:consonants]
|
123
149
|
end
|
150
|
+
|
151
|
+
private
|
152
|
+
|
153
|
+
# stemming is only possible if the fast-stemmer gem is installed
|
154
|
+
# doesn't work for JRuby
|
155
|
+
def use_stemming?
|
156
|
+
if @emphasis[:stemming]
|
157
|
+
begin
|
158
|
+
require 'fast_stemmer'
|
159
|
+
true
|
160
|
+
rescue LoadError
|
161
|
+
false
|
162
|
+
end
|
163
|
+
else
|
164
|
+
false
|
165
|
+
end
|
166
|
+
end
|
124
167
|
end
|
125
168
|
end
|
data/lib/highscore/keywords.rb
CHANGED
@@ -12,10 +12,10 @@ module Highscore
|
|
12
12
|
#
|
13
13
|
# @param content String
|
14
14
|
# @param wordlist Highscore::Wordlist
|
15
|
+
# @param pattern Regexp
|
15
16
|
# @return Highscore::Keywords
|
16
|
-
def self.find_keywords content, wordlist
|
17
|
-
keywords = content.to_s.scan(
|
18
|
-
|
17
|
+
def self.find_keywords content, wordlist, pattern=/\w+/
|
18
|
+
keywords = content.to_s.scan(pattern)
|
19
19
|
keywords.delete_if do |key, value|
|
20
20
|
if wordlist.kind_of? Highscore::Blacklist
|
21
21
|
wordlist.include?(key.downcase)
|
@@ -1,6 +1,7 @@
|
|
1
1
|
$:.unshift(File.join(File.dirname(__FILE__), %w{.. .. lib highscore}))
|
2
2
|
require "content"
|
3
3
|
require "test/unit"
|
4
|
+
require 'rubygems'
|
4
5
|
|
5
6
|
class TestContent < Test::Unit::TestCase
|
6
7
|
def setup
|
@@ -50,4 +51,38 @@ class TestContent < Test::Unit::TestCase
|
|
50
51
|
|
51
52
|
assert_equal 4, keywords.length
|
52
53
|
end
|
54
|
+
|
55
|
+
def test_word_pattern
|
56
|
+
keywords = 'foo Ruby foo Ruby'.keywords do
|
57
|
+
set :word_pattern, /(?=(\b\w+\s\w+\b))/
|
58
|
+
end
|
59
|
+
|
60
|
+
assert_equal 2, keywords.length
|
61
|
+
end
|
62
|
+
|
63
|
+
def test_ignore_case
|
64
|
+
keywords = 'foo Foo bar Bar'.keywords do
|
65
|
+
set :ignore_case, true
|
66
|
+
end
|
67
|
+
|
68
|
+
assert_equal 2, keywords.length
|
69
|
+
end
|
70
|
+
|
71
|
+
def test_stemming
|
72
|
+
begin
|
73
|
+
require 'fast_stemmer'
|
74
|
+
|
75
|
+
keywords = 'word words boards board woerter wort'.keywords do
|
76
|
+
set :stemming, true
|
77
|
+
end
|
78
|
+
|
79
|
+
assert_equal 4, keywords.length
|
80
|
+
|
81
|
+
keywords.each do |k|
|
82
|
+
assert (%w{board word woerter wort}).include?(k.text)
|
83
|
+
end
|
84
|
+
rescue LoadError
|
85
|
+
# do nothing, just skip this test
|
86
|
+
end
|
87
|
+
end
|
53
88
|
end
|
data/version.txt
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.5.
|
1
|
+
0.5.2
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: highscore
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.
|
4
|
+
version: 0.5.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-02-
|
12
|
+
date: 2012-02-25 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bones
|
16
|
-
requirement: &
|
16
|
+
requirement: &70144626889800 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,8 +21,8 @@ dependencies:
|
|
21
21
|
version: 3.7.3
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
25
|
-
description:
|
24
|
+
version_requirements: *70144626889800
|
25
|
+
description: Easily find and rank keywords in long texts.
|
26
26
|
email: liebler.dominik@googlemail.com
|
27
27
|
executables:
|
28
28
|
- highscore
|
@@ -83,7 +83,7 @@ rubyforge_project: highscore
|
|
83
83
|
rubygems_version: 1.8.16
|
84
84
|
signing_key:
|
85
85
|
specification_version: 3
|
86
|
-
summary:
|
86
|
+
summary: Easily find and rank keywords in long texts.
|
87
87
|
test_files:
|
88
88
|
- test/highscore/test_blacklist.rb
|
89
89
|
- test/highscore/test_content.rb
|