highscore 0.5.0 → 0.5.2

Sign up to get free protection for your applications and to get access to all the features.
data/.travis.yml CHANGED
@@ -6,5 +6,7 @@ rvm:
6
6
  - jruby-18mode # JRuby in 1.8 mode
7
7
  - jruby-19mode # JRuby in 1.9 mode
8
8
  - rbx-18mode
9
- # - rbx-19mode # currently in active development, may or may not work for your project
10
- before_install: gem install bones
9
+ - rbx-19mode
10
+
11
+ # fast-stemmer is a C extension, so it won't work with JRuby ...
12
+ before_install: gem install bones; (gem install fast-stemmer &); sleep 10
data/History.txt CHANGED
@@ -1,3 +1,9 @@
1
+ == 0.5.2 / 2012-02-25
2
+
3
+ * added (optional) support for word stemming using the fast-stemmer gem
4
+ * setting to enable/disable case-sensitivity (thanks to carlosramireziii)
5
+ * use other regex patterns to cut the text into keywords (thanks to carlosramireziii)
6
+
1
7
  == 0.5.0 / 2012-02-20
2
8
 
3
9
  * now supports a whitelist approach
data/README.md CHANGED
@@ -1,7 +1,9 @@
1
1
  highscore
2
2
  ===========
3
3
 
4
- Find and rank keywords in long texts.
4
+ Easily find and rank keywords in long texts.
5
+
6
+ [![Build Status](https://secure.travis-ci.org/domnikl/highscore.png?branch=develop)](http://travis-ci.org/domnikl/highscore)
5
7
 
6
8
  Features
7
9
  --------
@@ -11,6 +13,7 @@ Features
11
13
  * directly get keywords from String objects
12
14
  * blacklist words via a plain text file, String or an Array of words
13
15
  * optionally, configure a whitelist and only words from that list will get ranked
16
+ * use word stemming (requires the fast-stemmer gem, doesn't work on JRuby platforms!)
14
17
  * merge together Keywords from multiple sources
15
18
  * contains a CLI tool that operates on STDIN/OUT and is configurable via parameters
16
19
 
@@ -24,8 +27,11 @@ text.configure do
24
27
  set :upper_case, 3
25
28
  set :long_words, 2
26
29
  set :long_words_threshold, 15
27
- set :vowels, 1 # => default = 0 = not considered
28
- set :consonants, 5 # => default = 0 = not considered
30
+ set :vowels, 1 # => default: 0 = not considered
31
+ set :consonants, 5 # => default: 0 = not considered
32
+ set :ignore_case, true # => default: false
33
+ set :word_pattern, /[\w]+[^\s0-9]/ # => default: /\w+/
34
+ set :stemming, true # => default: false
29
35
  end
30
36
 
31
37
  # get only the top 50 keywords
@@ -94,6 +100,10 @@ Install
94
100
 
95
101
  * `[sudo] gem install highscore`
96
102
 
103
+ To use word stemming, you need to have the fast-stemmer gem installed:
104
+
105
+ * `[sudo] gem install fast-stemmer`
106
+
97
107
  Author
98
108
  ------
99
109
 
data/Rakefile CHANGED
@@ -1,3 +1,4 @@
1
+ require 'rubygems'
1
2
 
2
3
  begin
3
4
  require 'bones'
@@ -15,4 +16,3 @@ Bones {
15
16
  url 'http://thewebdev.de'
16
17
  ignore_file '.gitignore'
17
18
  }
18
-
data/bin/highscore CHANGED
@@ -31,14 +31,16 @@ optparse = OptionParser.new do |opts|
31
31
  options[:emphasis][:multiplier] = multiplier.to_f
32
32
  end
33
33
 
34
- # don't print rank weight
35
34
  opts.on('--no-ignore-short', 'don\'t ignore short words (<= 2 chars)') do
36
35
  options[:emphasis][:ignore_short_words] = false
37
36
  end
38
37
 
39
- # don't print rank weight
40
- opts.on('-s', '--short', 'don\'t print rank weight') do
41
- options[:short] = true
38
+ opts.on('--ignore-case', 'case insensitive') do
39
+ options[:emphasis][:ignore_case] = true
40
+ end
41
+
42
+ opts.on('--stemming', 'use word stemming (won\'t work on JRuby platforms)') do
43
+ options[:emphasis][:stemming] = true
42
44
  end
43
45
 
44
46
  # upper case
@@ -69,6 +71,11 @@ optparse = OptionParser.new do |opts|
69
71
  opts.on('-n', '--top N', 'show only the top N keywords') do |u|
70
72
  options[:top] = u.to_i
71
73
  end
74
+
75
+ # don't print rank weight
76
+ opts.on('-s', '--short', 'don\'t print rank weight') do
77
+ options[:short] = true
78
+ end
72
79
  end
73
80
 
74
81
  optparse.parse!
@@ -1,6 +1,9 @@
1
- $:.unshift(File.join(File.dirname(__FILE__)))
1
+ $:.unshift(File.dirname(__FILE__))
2
2
  require 'keywords'
3
3
 
4
+ # external gems
5
+ require 'rubygems'
6
+
4
7
  module Highscore
5
8
  class Content
6
9
  attr_reader :content
@@ -26,7 +29,10 @@ module Highscore
26
29
  :long_words_threshold => 15,
27
30
  :vowels => 0,
28
31
  :consonants => 0,
29
- :ignore_short_words => true
32
+ :ignore_short_words => true,
33
+ :ignore_case => false,
34
+ :word_pattern => /\w+/,
35
+ :stemming => false
30
36
  }
31
37
  end
32
38
 
@@ -49,10 +55,13 @@ module Highscore
49
55
  #
50
56
  # @return Highscore::Keywords
51
57
  def keywords
58
+ @emphasis[:stemming] = use_stemming?
59
+
52
60
  keywords = Keywords.new
53
61
 
54
- Keywords.find_keywords(@content, wordlist).each do |text|
62
+ Keywords.find_keywords(processed_content, wordlist, word_pattern).each do |text|
55
63
  text = text.to_s
64
+ text = text.stem if @emphasis[:stemming]
56
65
 
57
66
  if not (text.match(/^[\d]+(\.[\d]+){0,1}$/) or text.length <= 2)
58
67
  keywords << Highscore::Keyword.new(text, weight(text))
@@ -77,6 +86,16 @@ module Highscore
77
86
 
78
87
  private
79
88
 
89
+ # processes the text content applying any necessary transformations
90
+ #
91
+ # @return String
92
+ def processed_content
93
+ "".tap do |result|
94
+ result.replace(@content) # initialize the result to be @content
95
+ result.replace(result.downcase) if @emphasis[:ignore_case]
96
+ end
97
+ end
98
+
80
99
  # allow short words to be rated
81
100
  #
82
101
  # @return TrueClass FalseClass
@@ -84,6 +103,13 @@ module Highscore
84
103
  not @emphasis[:ignore_short_words]
85
104
  end
86
105
 
106
+ # regex used to split text
107
+ #
108
+ # @return Regexp
109
+ def word_pattern
110
+ @emphasis[:word_pattern]
111
+ end
112
+
87
113
  # weight a single text keyword
88
114
  #
89
115
  # @param text String
@@ -121,5 +147,22 @@ module Highscore
121
147
  percent = text.consonants.length / text.length.to_f
122
148
  percent * @emphasis[:consonants]
123
149
  end
150
+
151
+ private
152
+
153
+ # stemming is only possible if fast-stemmer is installed
154
+ # doesn't work for JRuby
155
+ def use_stemming?
156
+ if @emphasis[:stemming]
157
+ begin
158
+ require 'fast_stemmer'
159
+ true
160
+ rescue LoadError
161
+ false
162
+ end
163
+ else
164
+ false
165
+ end
166
+ end
124
167
  end
125
168
  end
@@ -12,10 +12,10 @@ module Highscore
12
12
  #
13
13
  # @param content String
14
14
  # @param wordlist Highscore::Wordlist
15
+ # @param pattern Regexp
15
16
  # @return Highscore::Keywords
16
- def self.find_keywords content, wordlist
17
- keywords = content.to_s.scan(/\w+/)
18
-
17
+ def self.find_keywords content, wordlist, pattern=/\w+/
18
+ keywords = content.to_s.scan(pattern)
19
19
  keywords.delete_if do |key, value|
20
20
  if wordlist.kind_of? Highscore::Blacklist
21
21
  wordlist.include?(key.downcase)
@@ -1,6 +1,7 @@
1
1
  $:.unshift(File.join(File.dirname(__FILE__), %w{.. .. lib highscore}))
2
2
  require "content"
3
3
  require "test/unit"
4
+ require 'rubygems'
4
5
 
5
6
  class TestContent < Test::Unit::TestCase
6
7
  def setup
@@ -50,4 +51,38 @@ class TestContent < Test::Unit::TestCase
50
51
 
51
52
  assert_equal 4, keywords.length
52
53
  end
54
+
55
+ def test_word_pattern
56
+ keywords = 'foo Ruby foo Ruby'.keywords do
57
+ set :word_pattern, /(?=(\b\w+\s\w+\b))/
58
+ end
59
+
60
+ assert_equal 2, keywords.length
61
+ end
62
+
63
+ def test_ignore_case
64
+ keywords = 'foo Foo bar Bar'.keywords do
65
+ set :ignore_case, true
66
+ end
67
+
68
+ assert_equal 2, keywords.length
69
+ end
70
+
71
+ def test_stemming
72
+ begin
73
+ require 'fast_stemmer'
74
+
75
+ keywords = 'word words boards board woerter wort'.keywords do
76
+ set :stemming, true
77
+ end
78
+
79
+ assert_equal 4, keywords.length
80
+
81
+ keywords.each do |k|
82
+ assert (%w{board word woerter wort}).include?(k.text)
83
+ end
84
+ rescue LoadError
85
+ # do nothing, just skip this test
86
+ end
87
+ end
53
88
  end
data/version.txt CHANGED
@@ -1 +1 @@
1
- 0.5.0
1
+ 0.5.2
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: highscore
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.5.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-20 00:00:00.000000000 Z
12
+ date: 2012-02-25 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bones
16
- requirement: &70127917245680 !ruby/object:Gem::Requirement
16
+ requirement: &70144626889800 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,8 +21,8 @@ dependencies:
21
21
  version: 3.7.3
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70127917245680
25
- description: Find and rank keywords in long texts.
24
+ version_requirements: *70144626889800
25
+ description: Easily find and rank keywords in long texts.
26
26
  email: liebler.dominik@googlemail.com
27
27
  executables:
28
28
  - highscore
@@ -83,7 +83,7 @@ rubyforge_project: highscore
83
83
  rubygems_version: 1.8.16
84
84
  signing_key:
85
85
  specification_version: 3
86
- summary: Find and rank keywords in long texts.
86
+ summary: Easily find and rank keywords in long texts.
87
87
  test_files:
88
88
  - test/highscore/test_blacklist.rb
89
89
  - test/highscore/test_content.rb