highscore 0.5.0 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.travis.yml CHANGED
@@ -6,5 +6,7 @@ rvm:
6
6
  - jruby-18mode # JRuby in 1.8 mode
7
7
  - jruby-19mode # JRuby in 1.9 mode
8
8
  - rbx-18mode
9
- # - rbx-19mode # currently in active development, may or may not work for your project
10
- before_install: gem install bones
9
+ - rbx-19mode
10
+
11
+ # fast-stemmer is a C extension, so it won't work with JRuby ...
12
+ before_install: gem install bones; (gem install fast-stemmer &); sleep 10
data/History.txt CHANGED
@@ -1,3 +1,9 @@
1
+ == 0.5.2 / 2012-02-25
2
+
3
+ * added (optional) support for word stemming using the fast-stemmer gem
4
+ * setting to enable/disable case-sensitivity (thanks to carlosramireziii)
5
+ * use other regex patterns to cut the text in keywords (thanks to carlosramireziii)
6
+
1
7
  == 0.5.0 / 2012-02-20
2
8
 
3
9
  * now supports a whitelist approach
data/README.md CHANGED
@@ -1,7 +1,9 @@
1
1
  highscore
2
2
  ===========
3
3
 
4
- Find and rank keywords in long texts.
4
+ Easily find and rank keywords in long texts.
5
+
6
+ [![Build Status](https://secure.travis-ci.org/domnikl/highscore.png?branch=develop)](http://travis-ci.org/domnikl/highscore)
5
7
 
6
8
  Features
7
9
  --------
@@ -11,6 +13,7 @@ Features
11
13
  * directly get keywords from String objects
12
14
  * blacklist words via a plain text file, String or an Array of words
13
15
  * optionally, configure a whitelist and only words from that list will get ranked
16
+ * use word stemming (requires the fast-stemmer gem, doesn't work on JRuby platforms!)
14
17
  * merge together Keywords from multiple sources
15
18
  * contains a CLI tool that operates on STDIN/OUT and is configurable via parameters
16
19
 
@@ -24,8 +27,11 @@ text.configure do
24
27
  set :upper_case, 3
25
28
  set :long_words, 2
26
29
  set :long_words_threshold, 15
27
- set :vowels, 1 # => default = 0 = not considered
28
- set :consonants, 5 # => default = 0 = not considered
30
+ set :vowels, 1 # => default: 0 = not considered
31
+ set :consonants, 5 # => default: 0 = not considered
32
+ set :ignore_case, true # => default: false
33
+ set :word_pattern, /[\w]+[^\s0-9]/ # => default: /\w+/
34
+ set :stemming, true # => default: false
29
35
  end
30
36
 
31
37
  # get only the top 50 keywords
@@ -94,6 +100,10 @@ Install
94
100
 
95
101
  * `[sudo] gem install highscore`
96
102
 
103
+ To use word stemming, you need to have the fast-stemmer gem installed:
104
+
105
+ * `[sudo] gem install fast-stemmer`
106
+
97
107
  Author
98
108
  ------
99
109
 
data/Rakefile CHANGED
@@ -1,3 +1,4 @@
1
+ require 'rubygems'
1
2
 
2
3
  begin
3
4
  require 'bones'
@@ -15,4 +16,3 @@ Bones {
15
16
  url 'http://thewebdev.de'
16
17
  ignore_file '.gitignore'
17
18
  }
18
-
data/bin/highscore CHANGED
@@ -31,14 +31,16 @@ optparse = OptionParser.new do |opts|
31
31
  options[:emphasis][:multiplier] = multiplier.to_f
32
32
  end
33
33
 
34
- # don't print rank weight
35
34
  opts.on('--no-ignore-short', 'don\'t ignore short words (<= 2 chars)') do
36
35
  options[:emphasis][:ignore_short_words] = false
37
36
  end
38
37
 
39
- # don't print rank weight
40
- opts.on('-s', '--short', 'don\'t print rank weight') do
41
- options[:short] = true
38
+ opts.on('--ignore-case', 'case insensitive') do
39
+ options[:emphasis][:ignore_case] = true
40
+ end
41
+
42
+ opts.on('--stemming', 'use word stemming (won\'t work on JRuby platforms)') do
43
+ options[:emphasis][:stemming] = true
42
44
  end
43
45
 
44
46
  # upper case
@@ -69,6 +71,11 @@ optparse = OptionParser.new do |opts|
69
71
  opts.on('-n', '--top N', 'show only the top N keywords') do |u|
70
72
  options[:top] = u.to_i
71
73
  end
74
+
75
+ # don't print rank weight
76
+ opts.on('-s', '--short', 'don\'t print rank weight') do
77
+ options[:short] = true
78
+ end
72
79
  end
73
80
 
74
81
  optparse.parse!
@@ -1,6 +1,9 @@
1
- $:.unshift(File.join(File.dirname(__FILE__)))
1
+ $:.unshift(File.dirname(__FILE__))
2
2
  require 'keywords'
3
3
 
4
+ # external gems
5
+ require 'rubygems'
6
+
4
7
  module Highscore
5
8
  class Content
6
9
  attr_reader :content
@@ -26,7 +29,10 @@ module Highscore
26
29
  :long_words_threshold => 15,
27
30
  :vowels => 0,
28
31
  :consonants => 0,
29
- :ignore_short_words => true
32
+ :ignore_short_words => true,
33
+ :ignore_case => false,
34
+ :word_pattern => /\w+/,
35
+ :stemming => false
30
36
  }
31
37
  end
32
38
 
@@ -49,10 +55,13 @@ module Highscore
49
55
  #
50
56
  # @return Highscore::Keywords
51
57
  def keywords
58
+ @emphasis[:stemming] = use_stemming?
59
+
52
60
  keywords = Keywords.new
53
61
 
54
- Keywords.find_keywords(@content, wordlist).each do |text|
62
+ Keywords.find_keywords(processed_content, wordlist, word_pattern).each do |text|
55
63
  text = text.to_s
64
+ text = text.stem if @emphasis[:stemming]
56
65
 
57
66
  if not (text.match(/^[\d]+(\.[\d]+){0,1}$/) or text.length <= 2)
58
67
  keywords << Highscore::Keyword.new(text, weight(text))
@@ -77,6 +86,16 @@ module Highscore
77
86
 
78
87
  private
79
88
 
89
+ # processes the text content applying any necessary transformations
90
+ #
91
+ # @return String
92
+ def processed_content
93
+ "".tap do |result|
94
+ result.replace(@content) # initialize the result to be @content
95
+ result.replace(result.downcase) if @emphasis[:ignore_case]
96
+ end
97
+ end
98
+
80
99
  # allow short words to be rated
81
100
  #
82
101
  # @return TrueClass FalseClass
@@ -84,6 +103,13 @@ module Highscore
84
103
  not @emphasis[:ignore_short_words]
85
104
  end
86
105
 
106
+ # regex used to split text
107
+ #
108
+ # @return Regexp
109
+ def word_pattern
110
+ @emphasis[:word_pattern]
111
+ end
112
+
87
113
  # weight a single text keyword
88
114
  #
89
115
  # @param text String
@@ -121,5 +147,22 @@ module Highscore
121
147
  percent = text.consonants.length / text.length.to_f
122
148
  percent * @emphasis[:consonants]
123
149
  end
150
+
151
+ private
152
+
153
+ # stemming is only possible if the fast-stemmer gem is installed
154
+ # doesn't work for JRuby
155
+ def use_stemming?
156
+ if @emphasis[:stemming]
157
+ begin
158
+ require 'fast_stemmer'
159
+ true
160
+ rescue LoadError
161
+ false
162
+ end
163
+ else
164
+ false
165
+ end
166
+ end
124
167
  end
125
168
  end
@@ -12,10 +12,10 @@ module Highscore
12
12
  #
13
13
  # @param content String
14
14
  # @param wordlist Highscore::Wordlist
15
+ # @param pattern Regexp
15
16
  # @return Highscore::Keywords
16
- def self.find_keywords content, wordlist
17
- keywords = content.to_s.scan(/\w+/)
18
-
17
+ def self.find_keywords content, wordlist, pattern=/\w+/
18
+ keywords = content.to_s.scan(pattern)
19
19
  keywords.delete_if do |key, value|
20
20
  if wordlist.kind_of? Highscore::Blacklist
21
21
  wordlist.include?(key.downcase)
@@ -1,6 +1,7 @@
1
1
  $:.unshift(File.join(File.dirname(__FILE__), %w{.. .. lib highscore}))
2
2
  require "content"
3
3
  require "test/unit"
4
+ require 'rubygems'
4
5
 
5
6
  class TestContent < Test::Unit::TestCase
6
7
  def setup
@@ -50,4 +51,38 @@ class TestContent < Test::Unit::TestCase
50
51
 
51
52
  assert_equal 4, keywords.length
52
53
  end
54
+
55
+ def test_word_pattern
56
+ keywords = 'foo Ruby foo Ruby'.keywords do
57
+ set :word_pattern, /(?=(\b\w+\s\w+\b))/
58
+ end
59
+
60
+ assert_equal 2, keywords.length
61
+ end
62
+
63
+ def test_ignore_case
64
+ keywords = 'foo Foo bar Bar'.keywords do
65
+ set :ignore_case, true
66
+ end
67
+
68
+ assert_equal 2, keywords.length
69
+ end
70
+
71
+ def test_stemming
72
+ begin
73
+ require 'fast_stemmer'
74
+
75
+ keywords = 'word words boards board woerter wort'.keywords do
76
+ set :stemming, true
77
+ end
78
+
79
+ assert_equal 4, keywords.length
80
+
81
+ keywords.each do |k|
82
+ assert (%w{board word woerter wort}).include?(k.text)
83
+ end
84
+ rescue LoadError
85
+ # do nothing, just skip this test
86
+ end
87
+ end
53
88
  end
data/version.txt CHANGED
@@ -1 +1 @@
1
- 0.5.0
1
+ 0.5.2
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: highscore
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.5.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-20 00:00:00.000000000 Z
12
+ date: 2012-02-25 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bones
16
- requirement: &70127917245680 !ruby/object:Gem::Requirement
16
+ requirement: &70144626889800 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,8 +21,8 @@ dependencies:
21
21
  version: 3.7.3
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70127917245680
25
- description: Find and rank keywords in long texts.
24
+ version_requirements: *70144626889800
25
+ description: Easily find and rank keywords in long texts.
26
26
  email: liebler.dominik@googlemail.com
27
27
  executables:
28
28
  - highscore
@@ -83,7 +83,7 @@ rubyforge_project: highscore
83
83
  rubygems_version: 1.8.16
84
84
  signing_key:
85
85
  specification_version: 3
86
- summary: Find and rank keywords in long texts.
86
+ summary: Easily find and rank keywords in long texts.
87
87
  test_files:
88
88
  - test/highscore/test_blacklist.rb
89
89
  - test/highscore/test_content.rb