text 0.2.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -2,35 +2,54 @@
2
2
 
3
3
  A collection of text algorithms.
4
4
 
5
- = Usage
5
+ == Usage
6
6
 
7
7
  require 'text'
8
8
 
9
- font = Text::Figlet::Font.new('big.flf')
10
- figlet = Text::Figlet::Typesetter.new(font)
11
- figlet['Hello World'] # => '...'
9
+ === Levenshtein distance
12
10
 
13
- Text::Levenshtein.distance('test', 'test') # => 0
14
- Text::Levenshtein.distance('test', 'tent') # => 1
11
+ Text::Levenshtein.distance('test', 'test')
12
+ # => 0
13
+ Text::Levenshtein.distance('test', 'tent')
14
+ # => 1
15
15
 
16
- Text::Metaphone.metaphone('BRIAN') # => 'BRN'
17
- Text::Metaphone.double_metaphone('Coburn') # => ['KPRN', nil]
18
- Text::Metaphone.double_metaphone('Angier') # => ['ANJ', 'ANJR']
16
+ === Metaphone
19
17
 
20
- Text::Soundex.soundex('Knuth') # => 'K530'
18
+ Text::Metaphone.metaphone('BRIAN')
19
+ # => 'BRN'
21
20
 
22
- Text::PorterStemming.stem('abatements') # => 'abat'
21
+ Text::Metaphone.double_metaphone('Coburn')
22
+ # => ['KPRN', nil]
23
+ Text::Metaphone.double_metaphone('Angier')
24
+ # => ['ANJ', 'ANJR']
23
25
 
24
- = Ruby 1.9 Compatibility
26
+ === Soundex
25
27
 
26
- Most parts of the library are now compatible including
27
- tests. The big exception are the Figlet libraries which
28
- allow you to quickly do text-rendering in ASCII.
29
- On 1.9, Figlet isn't loaded and isn't tested.
28
+ Text::Soundex.soundex('Knuth')
29
+ # => 'K530'
30
30
 
31
- Work to integrate in Ruby 1.9 compatibility was done by
32
- Hampton Catlin (hcatlin)
31
+ === Porter stemming
33
32
 
34
- = License
33
+ Text::PorterStemming.stem('abatements') # => 'abat'
34
+
35
+ === White similarity
36
+
37
+ white = Text::WhiteSimilarity.new
38
+ white.similarity('Healed', 'Sealed') # => 0.8
39
+ white.similarity('Healed', 'Help') # => 0.25
40
+
41
+ Note that some intermediate information is cached on the instance to improve
42
+ performance.
43
+
44
+ == Ruby version compatibility
45
+
46
+ The library has been tested on Ruby 1.8.6 to 1.9.3 and on JRuby.
47
+
48
+ == Thanks
49
+
50
+ * Hampton Catlin (hcatlin) for Ruby 1.9 compatibility work
51
+ * Wilker Lúcio for the initial implementation of the White algorithm
52
+
53
+ == License
35
54
 
36
55
  Same as Ruby.
data/Rakefile CHANGED
@@ -1,48 +1,8 @@
1
1
  require 'rake'
2
2
  require 'rake/testtask'
3
- require 'rake/packagetask'
4
- require 'rake/gempackagetask'
5
- require 'rcov/rcovtask'
6
- require 'rake/rdoctask'
7
-
8
- $:.unshift(File.dirname(__FILE__) + '/lib')
9
- require 'text/version'
10
-
11
- gemspec = Gem::Specification.new do |s|
12
- s.name = 'text'
13
- s.version = Text::VERSION::STRING
14
- s.summary = 'A collection of text algorithms'
15
- s.description = 'A collection of text algorithms: Levenshtein, Soundex, Metaphone, Double Metaphone, Figlet, Porter Stemming'
16
- s.files = FileList['{lib,test}/**/*', 'README.rdoc', 'Rakefile']
17
- s.require_path = 'lib'
18
- s.has_rdoc = true
19
- s.extra_rdoc_files = %w[README.rdoc]
20
- s.rubyforge_project = 'text'
21
- s.homepage = 'http://github.com/threedaymonk/text'
22
- s.authors = ['Paul Battley', 'Michael Neumann', 'Tim Fletcher']
23
- s.email = "pbattley@gmail.com"
24
- end
25
-
26
- Rake::GemPackageTask.new(gemspec) do |pkg|
27
- pkg.need_tar_gz = true
28
- end
29
-
30
- Rake::PackageTask.new(gemspec.name, gemspec.version) do |pkg|
31
- pkg.need_tar_gz = true
32
- pkg.package_files.include gemspec.files
33
- end
34
3
 
35
4
  Rake::TestTask.new do |t|
36
5
  t.verbose = false
37
6
  end
38
7
 
39
- Rcov::RcovTask.new do |t|
40
- t.rcov_opts = []
41
- end
42
-
43
- Rake::RDocTask.new do |t|
44
- t.main = 'README.rdoc'
45
- t.rdoc_files.include 'README.rdoc', 'lib/**/*.rb'
46
- end
47
-
48
8
  task :default => :test
data/lib/text/version.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  module Text
2
2
  module VERSION #:nodoc:
3
- MAJOR = 0
4
- MINOR = 2
3
+ MAJOR = 1
4
+ MINOR = 0
5
5
  TINY = 0
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
@@ -0,0 +1,57 @@
1
+ # encoding: utf-8
2
+ # Original author: Wilker Lúcio <wilkerlucio@gmail.com>
3
+
4
+ require "set"
5
+
6
module Text

  # Ruby implementation of the string similarity described by Simon White
  # at: http://www.catalysoft.com/articles/StrikeAMatch.html
  #
  #                        2 * |pairs(s1) INTERSECT pairs(s2)|
  #   similarity(s1, s2) = -----------------------------------
  #                            |pairs(s1)| + |pairs(s2)|
  #
  # e.g.
  #                                              2 * |{FR, NC}|
  #   similarity(FRANCE, FRENCH) = ---------------------------------------
  #                                |{FR,RA,AN,NC,CE}| + |{FR,RE,EN,NC,CH}|
  #
  #                              = (2 * 2) / (5 + 5)
  #
  #                              = 0.4
  #
  #   WhiteSimilarity.new.similarity("FRANCE", "FRENCH")
  #
  # Input strings are upcased and split on whitespace before being broken
  # into adjacent two-character pairs, so the comparison ignores case and no
  # pair spans a word boundary. Pairs are held in a Set, so a pair that occurs
  # more than once in a string is counted once.
  #
  # The pair set computed for each string is memoized on the instance; reuse
  # one instance when comparing the same strings repeatedly.
  class WhiteSimilarity

    # Convenience wrapper: compares two strings using a throwaway instance
    # (and therefore without any caching between calls).
    def self.similarity(str1, str2)
      new.similarity(str1, str2)
    end

    def initialize
      # Maps an input string to its Set of letter pairs.
      @word_letter_pairs = {}
    end

    # Returns the similarity of str1 and str2 as a Float: 0.0 when no pairs
    # are shared, 1.0 when both strings produce the same pair set.
    #
    # When neither string yields any pair (e.g. both are empty or consist
    # only of one-character words) the denominator is zero and the result
    # is Float::NAN.
    def similarity(str1, str2)
      pairs1 = word_letter_pairs(str1)
      pairs2 = word_letter_pairs(str2)

      shared = (pairs1 & pairs2).size
      total  = pairs1.length + pairs2.length

      (2.0 * shared) / total
    end

    private

    # Returns (and caches) the Set of adjacent two-character pairs found in
    # each whitespace-separated, upcased word of str.
    def word_letter_pairs(str)
      @word_letter_pairs[str] ||= Set.new(
        str.upcase.split(/\s+/).map { |word|
          # Slice the pairs out of the upcased word itself; slicing the
          # original string would ignore both the upcasing and the word split.
          (0...(word.length - 1)).map { |i| word[i, 2] }
        }.flatten
      )
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+ require "text/white_similarity"
3
+
4
class WhiteSimilarityTest < Test::Unit::TestCase

  # Reference word every candidate is compared against.
  REFERENCE_WORD = "Healed"

  # [expected score, candidate] rows, checked in this order.
  EXPECTED_SCORES = [
    [0.8,  "Sealed"],
    [0.55, "Healthy"],
    [0.44, "Heard"],
    [0.40, "Herded"],
    [0.25, "Help"],
    [0.0,  "Sold"]
  ]

  # Scores computed through the class-level convenience method.
  def test_similarity
    assert_expected_scores do |candidate|
      Text::WhiteSimilarity.similarity(REFERENCE_WORD, candidate)
    end
  end

  # Scores computed through a single reused instance, so the pair cache
  # for the reference word is hit on every comparison after the first.
  def test_similarity_with_caching
    white = Text::WhiteSimilarity.new

    assert_expected_scores do |candidate|
      white.similarity(REFERENCE_WORD, candidate)
    end
  end

  private

  # Yields each candidate word and asserts the block's result is within
  # 0.01 of the expected score for that candidate.
  def assert_expected_scores
    EXPECTED_SCORES.each do |expected, candidate|
      assert_in_delta expected, yield(candidate), 0.01
    end
  end
end
+ end
metadata CHANGED
@@ -1,81 +1,75 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: text
3
- version: !ruby/object:Gem::Version
4
- version: 0.2.0
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
5
6
  platform: ruby
6
- authors:
7
+ authors:
7
8
  - Paul Battley
8
9
  - Michael Neumann
9
10
  - Tim Fletcher
10
11
  autorequire:
11
12
  bindir: bin
12
13
  cert_chain: []
13
-
14
- date: 2010-03-03 00:00:00 +00:00
15
- default_executable:
14
+ date: 2011-11-21 00:00:00.000000000Z
16
15
  dependencies: []
17
-
18
- description: "A collection of text algorithms: Levenshtein, Soundex, Metaphone, Double Metaphone, Figlet, Porter Stemming"
16
+ description: ! 'A collection of text algorithms: Levenshtein, Soundex, Metaphone,
17
+ Double Metaphone, Figlet, Porter Stemming'
19
18
  email: pbattley@gmail.com
20
19
  executables: []
21
-
22
20
  extensions: []
23
-
24
- extra_rdoc_files:
21
+ extra_rdoc_files:
25
22
  - README.rdoc
26
- files:
27
- - lib/text/double_metaphone.rb
23
+ files:
28
24
  - lib/text/levenshtein.rb
29
25
  - lib/text/metaphone.rb
30
- - lib/text/porter_stemming.rb
31
26
  - lib/text/soundex.rb
27
+ - lib/text/white_similarity.rb
28
+ - lib/text/double_metaphone.rb
29
+ - lib/text/porter_stemming.rb
32
30
  - lib/text/util.rb
33
31
  - lib/text/version.rb
34
32
  - lib/text.rb
35
- - test/data/big.flf
36
- - test/data/big.txt
33
+ - test/test_porter_stemming.rb
37
34
  - test/data/chunky.flf
38
- - test/data/chunky.txt
39
- - test/data/double_metaphone.csv
40
- - test/data/metaphone.txt
41
- - test/data/metaphone_buggy.txt
42
35
  - test/data/porter_stemming_input.txt
36
+ - test/data/metaphone.txt
37
+ - test/data/double_metaphone.csv
38
+ - test/data/big.flf
43
39
  - test/data/porter_stemming_output.txt
40
+ - test/data/metaphone_buggy.txt
41
+ - test/data/chunky.txt
42
+ - test/data/big.txt
44
43
  - test/preamble.rb
45
- - test/test_double_metaphone.rb
46
- - test/test_levenshtein.rb
47
- - test/test_metaphone.rb
48
- - test/test_porter_stemming.rb
49
44
  - test/test_soundex.rb
45
+ - test/test_white_similarity.rb
46
+ - test/test_metaphone.rb
47
+ - test/test_levenshtein.rb
48
+ - test/test_double_metaphone.rb
50
49
  - README.rdoc
51
50
  - Rakefile
52
- has_rdoc: true
53
51
  homepage: http://github.com/threedaymonk/text
54
52
  licenses: []
55
-
56
53
  post_install_message:
57
54
  rdoc_options: []
58
-
59
- require_paths:
55
+ require_paths:
60
56
  - lib
61
- required_ruby_version: !ruby/object:Gem::Requirement
62
- requirements:
63
- - - ">="
64
- - !ruby/object:Gem::Version
65
- version: "0"
66
- version:
67
- required_rubygems_version: !ruby/object:Gem::Requirement
68
- requirements:
69
- - - ">="
70
- - !ruby/object:Gem::Version
71
- version: "0"
72
- version:
57
+ required_ruby_version: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ! '>='
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ required_rubygems_version: !ruby/object:Gem::Requirement
64
+ none: false
65
+ requirements:
66
+ - - ! '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
73
69
  requirements: []
74
-
75
70
  rubyforge_project: text
76
- rubygems_version: 1.3.5
71
+ rubygems_version: 1.8.11
77
72
  signing_key:
78
73
  specification_version: 3
79
74
  summary: A collection of text algorithms
80
75
  test_files: []
81
-