text 0.2.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -2,35 +2,54 @@
2
2
 
3
3
  A collection of text algorithms.
4
4
 
5
- = Usage
5
+ == Usage
6
6
 
7
7
  require 'text'
8
8
 
9
- font = Text::Figlet::Font.new('big.flf')
10
- figlet = Text::Figlet::Typesetter.new(font)
11
- figlet['Hello World'] # => '...'
9
+ === Levenshtein distance
12
10
 
13
- Text::Levenshtein.distance('test', 'test') # => 0
14
- Text::Levenshtein.distance('test', 'tent') # => 1
11
+ Text::Levenshtein.distance('test', 'test')
12
+ # => 0
13
+ Text::Levenshtein.distance('test', 'tent')
14
+ # => 1
15
15
 
16
- Text::Metaphone.metaphone('BRIAN') # => 'BRN'
17
- Text::Metaphone.double_metaphone('Coburn') # => ['KPRN', nil]
18
- Text::Metaphone.double_metaphone('Angier') # => ['ANJ', 'ANJR']
16
+ === Metaphone
19
17
 
20
- Text::Soundex.soundex('Knuth') # => 'K530'
18
+ Text::Metaphone.metaphone('BRIAN')
19
+ # => 'BRN'
21
20
 
22
- Text::PorterStemming.stem('abatements') # => 'abat'
21
+ Text::Metaphone.double_metaphone('Coburn')
22
+ # => ['KPRN', nil]
23
+ Text::Metaphone.double_metaphone('Angier')
24
+ # => ['ANJ', 'ANJR']
23
25
 
24
- = Ruby 1.9 Compatibility
26
+ === Soundex
25
27
 
26
- Most parts of the library are now compatible including
27
- tests. The big exception are the Figlet libraries which
28
- allow you to quickly do text-rendering in ASCII.
29
- On 1.9, Figlet isn't loaded and isn't tested.
28
+ Text::Soundex.soundex('Knuth')
29
+ # => 'K530'
30
30
 
31
- Work to integrate in Ruby 1.9 compatibility was done by
32
- Hampton Catlin (hcatlin)
31
+ === Porter stemming
33
32
 
34
- = License
33
+ Text::PorterStemming.stem('abatements') # => 'abat'
34
+
35
+ === White similarity
36
+
37
+ white = Text::WhiteSimilarity.new
38
+ white.similarity('Healed', 'Sealed') # => 0.8
39
+ white.similarity('Healed', 'Help') # => 0.25
40
+
41
+ Note that some intermediate information is cached on the instance to improve
42
+ performance.
43
+
44
+ == Ruby version compatibility
45
+
46
+ The library has been tested on Ruby 1.8.6 to 1.9.3 and on JRuby.
47
+
48
+ == Thanks
49
+
50
+ * Hampton Catlin (hcatlin) for Ruby 1.9 compatibility work
51
+ * Wilker Lúcio for the initial implementation of the White algorithm
52
+
53
+ == License
35
54
 
36
55
  Same as Ruby.
data/Rakefile CHANGED
@@ -1,48 +1,8 @@
1
1
  require 'rake'
2
2
  require 'rake/testtask'
3
- require 'rake/packagetask'
4
- require 'rake/gempackagetask'
5
- require 'rcov/rcovtask'
6
- require 'rake/rdoctask'
7
-
8
- $:.unshift(File.dirname(__FILE__) + '/lib')
9
- require 'text/version'
10
-
11
- gemspec = Gem::Specification.new do |s|
12
- s.name = 'text'
13
- s.version = Text::VERSION::STRING
14
- s.summary = 'A collection of text algorithms'
15
- s.description = 'A collection of text algorithms: Levenshtein, Soundex, Metaphone, Double Metaphone, Figlet, Porter Stemming'
16
- s.files = FileList['{lib,test}/**/*', 'README.rdoc', 'Rakefile']
17
- s.require_path = 'lib'
18
- s.has_rdoc = true
19
- s.extra_rdoc_files = %w[README.rdoc]
20
- s.rubyforge_project = 'text'
21
- s.homepage = 'http://github.com/threedaymonk/text'
22
- s.authors = ['Paul Battley', 'Michael Neumann', 'Tim Fletcher']
23
- s.email = "pbattley@gmail.com"
24
- end
25
-
26
- Rake::GemPackageTask.new(gemspec) do |pkg|
27
- pkg.need_tar_gz = true
28
- end
29
-
30
- Rake::PackageTask.new(gemspec.name, gemspec.version) do |pkg|
31
- pkg.need_tar_gz = true
32
- pkg.package_files.include gemspec.files
33
- end
34
3
 
35
4
  Rake::TestTask.new do |t|
36
5
  t.verbose = false
37
6
  end
38
7
 
39
- Rcov::RcovTask.new do |t|
40
- t.rcov_opts = []
41
- end
42
-
43
- Rake::RDocTask.new do |t|
44
- t.main = 'README.rdoc'
45
- t.rdoc_files.include 'README.rdoc', 'lib/**/*.rb'
46
- end
47
-
48
8
  task :default => :test
data/lib/text/version.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  module Text
2
2
  module VERSION #:nodoc:
3
- MAJOR = 0
4
- MINOR = 2
3
+ MAJOR = 1
4
+ MINOR = 0
5
5
  TINY = 0
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
@@ -0,0 +1,57 @@
1
+ # encoding: utf-8
2
+ # Original author: Wilker Lúcio <wilkerlucio@gmail.com>
3
+
4
+ require "set"
5
+
6
module Text

  # Ruby implementation of the string similarity described by Simon White
  # at: http://www.catalysoft.com/articles/StrikeAMatch.html
  #
  #                          2 * |pairs(s1) INTERSECT pairs(s2)|
  #   similarity(s1, s2) = -----------------------------------
  #                              |pairs(s1)| + |pairs(s2)|
  #
  # e.g.
  #                                          2 * |{FR, NC}|
  #   similarity(FRANCE, FRENCH) = ---------------------------------------
  #                                |{FR,RA,AN,NC,CE}| + |{FR,RE,EN,NC,CH}|
  #
  #                              = (2 * 2) / (5 + 5)
  #
  #                              = 0.4
  #
  #   WhiteSimilarity.new.similarity("FRANCE", "FRENCH")
  #
  # Strings are upcased before pairs are extracted, so the comparison is
  # case-insensitive. Strings are split on whitespace and pairs are taken
  # within each word only; a pair never spans two words. The pairs of a
  # string are held in a Set, so a pair that occurs several times in the
  # same string is counted once.
  #
  class WhiteSimilarity

    # Convenience form that compares two strings using a throwaway instance
    # (nothing is cached between calls).
    def self.similarity(str1, str2)
      new.similarity(str1, str2)
    end

    def initialize
      # Cache of letter-pair sets, keyed by the original string.
      @word_letter_pairs = {}
    end

    # Returns the similarity of +str1+ and +str2+ as a Float: 0.0 when they
    # share no letter pairs, 1.0 when their pair sets are identical.
    #
    # When neither string yields any pair (e.g. both are empty or consist
    # of single-character words) the result is 0.0 / 0, i.e. Float NaN.
    def similarity(str1, str2)
      pairs1 = word_letter_pairs(str1)
      pairs2 = word_letter_pairs(str2)

      shared = (pairs1 & pairs2).size
      total  = pairs1.size + pairs2.size

      (2.0 * shared) / total
    end

  private

    # Returns the Set of adjacent two-letter pairs of every
    # whitespace-separated word in +str+, upcased. The result is memoized
    # per input string on this instance.
    def word_letter_pairs(str)
      @word_letter_pairs[str] ||= Set.new(
        str.upcase.split(/\s+/).map { |word|
          # Slice from the upcased word, not from the original string, so
          # pairs are case-normalized and stay inside the current word.
          (0 ... (word.length - 1)).map { |i| word[i, 2] }
        }.flatten
      )
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+ require "text/white_similarity"
3
+
4
# Exercises Text::WhiteSimilarity through both its class-level shortcut and a
# reused instance, against the same set of reference scores.
class WhiteSimilarityTest < Test::Unit::TestCase

  # Word every candidate is compared against.
  WORD = "Healed"

  # [candidate, expected similarity to WORD], in the order they are asserted.
  EXPECTED = [
    ["Sealed",  0.8],
    ["Healthy", 0.55],
    ["Heard",   0.44],
    ["Herded",  0.40],
    ["Help",    0.25],
    ["Sold",    0.0]
  ]

  # Allowed absolute difference between expected and actual score.
  TOLERANCE = 0.01

  # Class-level entry point: a new instance is used for each comparison.
  def test_similarity
    EXPECTED.each do |candidate, score|
      assert_in_delta score, Text::WhiteSimilarity.similarity(WORD, candidate), TOLERANCE
    end
  end

  # One instance is reused for all comparisons, so the pairs computed for
  # WORD are served from the instance cache after the first call.
  def test_similarity_with_caching
    white = Text::WhiteSimilarity.new

    EXPECTED.each do |candidate, score|
      assert_in_delta score, white.similarity(WORD, candidate), TOLERANCE
    end
  end
end
metadata CHANGED
@@ -1,81 +1,75 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: text
3
- version: !ruby/object:Gem::Version
4
- version: 0.2.0
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
5
6
  platform: ruby
6
- authors:
7
+ authors:
7
8
  - Paul Battley
8
9
  - Michael Neumann
9
10
  - Tim Fletcher
10
11
  autorequire:
11
12
  bindir: bin
12
13
  cert_chain: []
13
-
14
- date: 2010-03-03 00:00:00 +00:00
15
- default_executable:
14
+ date: 2011-11-21 00:00:00.000000000Z
16
15
  dependencies: []
17
-
18
- description: "A collection of text algorithms: Levenshtein, Soundex, Metaphone, Double Metaphone, Figlet, Porter Stemming"
16
+ description: ! 'A collection of text algorithms: Levenshtein, Soundex, Metaphone,
17
+ Double Metaphone, Figlet, Porter Stemming'
19
18
  email: pbattley@gmail.com
20
19
  executables: []
21
-
22
20
  extensions: []
23
-
24
- extra_rdoc_files:
21
+ extra_rdoc_files:
25
22
  - README.rdoc
26
- files:
27
- - lib/text/double_metaphone.rb
23
+ files:
28
24
  - lib/text/levenshtein.rb
29
25
  - lib/text/metaphone.rb
30
- - lib/text/porter_stemming.rb
31
26
  - lib/text/soundex.rb
27
+ - lib/text/white_similarity.rb
28
+ - lib/text/double_metaphone.rb
29
+ - lib/text/porter_stemming.rb
32
30
  - lib/text/util.rb
33
31
  - lib/text/version.rb
34
32
  - lib/text.rb
35
- - test/data/big.flf
36
- - test/data/big.txt
33
+ - test/test_porter_stemming.rb
37
34
  - test/data/chunky.flf
38
- - test/data/chunky.txt
39
- - test/data/double_metaphone.csv
40
- - test/data/metaphone.txt
41
- - test/data/metaphone_buggy.txt
42
35
  - test/data/porter_stemming_input.txt
36
+ - test/data/metaphone.txt
37
+ - test/data/double_metaphone.csv
38
+ - test/data/big.flf
43
39
  - test/data/porter_stemming_output.txt
40
+ - test/data/metaphone_buggy.txt
41
+ - test/data/chunky.txt
42
+ - test/data/big.txt
44
43
  - test/preamble.rb
45
- - test/test_double_metaphone.rb
46
- - test/test_levenshtein.rb
47
- - test/test_metaphone.rb
48
- - test/test_porter_stemming.rb
49
44
  - test/test_soundex.rb
45
+ - test/test_white_similarity.rb
46
+ - test/test_metaphone.rb
47
+ - test/test_levenshtein.rb
48
+ - test/test_double_metaphone.rb
50
49
  - README.rdoc
51
50
  - Rakefile
52
- has_rdoc: true
53
51
  homepage: http://github.com/threedaymonk/text
54
52
  licenses: []
55
-
56
53
  post_install_message:
57
54
  rdoc_options: []
58
-
59
- require_paths:
55
+ require_paths:
60
56
  - lib
61
- required_ruby_version: !ruby/object:Gem::Requirement
62
- requirements:
63
- - - ">="
64
- - !ruby/object:Gem::Version
65
- version: "0"
66
- version:
67
- required_rubygems_version: !ruby/object:Gem::Requirement
68
- requirements:
69
- - - ">="
70
- - !ruby/object:Gem::Version
71
- version: "0"
72
- version:
57
+ required_ruby_version: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ! '>='
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ required_rubygems_version: !ruby/object:Gem::Requirement
64
+ none: false
65
+ requirements:
66
+ - - ! '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
73
69
  requirements: []
74
-
75
70
  rubyforge_project: text
76
- rubygems_version: 1.3.5
71
+ rubygems_version: 1.8.11
77
72
  signing_key:
78
73
  specification_version: 3
79
74
  summary: A collection of text algorithms
80
75
  test_files: []
81
-