text 0.2.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +38 -19
- data/Rakefile +0 -40
- data/lib/text/version.rb +2 -2
- data/lib/text/white_similarity.rb +57 -0
- data/test/test_white_similarity.rb +29 -0
- metadata +38 -44
data/README.rdoc
CHANGED
@@ -2,35 +2,54 @@
|
|
2
2
|
|
3
3
|
A collection of text algorithms.
|
4
4
|
|
5
|
-
|
5
|
+
== Usage
|
6
6
|
|
7
7
|
require 'text'
|
8
8
|
|
9
|
-
|
10
|
-
figlet = Text::Figlet::Typesetter.new(font)
|
11
|
-
figlet['Hello World'] # => '...'
|
9
|
+
=== Levenshtein distance
|
12
10
|
|
13
|
-
Text::Levenshtein.distance('test', 'test')
|
14
|
-
|
11
|
+
Text::Levenshtein.distance('test', 'test')
|
12
|
+
# => 0
|
13
|
+
Text::Levenshtein.distance('test', 'tent')
|
14
|
+
# => 1
|
15
15
|
|
16
|
-
|
17
|
-
Text::Metaphone.double_metaphone('Coburn') # => ['KPRN', nil]
|
18
|
-
Text::Metaphone.double_metaphone('Angier') # => ['ANJ', 'ANJR']
|
16
|
+
=== Metaphone
|
19
17
|
|
20
|
-
Text::
|
18
|
+
Text::Metaphone.metaphone('BRIAN')
|
19
|
+
# => 'BRN'
|
21
20
|
|
22
|
-
Text::
|
21
|
+
Text::Metaphone.double_metaphone('Coburn')
|
22
|
+
# => ['KPRN', nil]
|
23
|
+
Text::Metaphone.double_metaphone('Angier')
|
24
|
+
# => ['ANJ', 'ANJR']
|
23
25
|
|
24
|
-
|
26
|
+
=== Soundex
|
25
27
|
|
26
|
-
|
27
|
-
|
28
|
-
allow you to quickly do text-rendering in ASCII.
|
29
|
-
On 1.9, Figlet isn't loaded and isn't tested.
|
28
|
+
Text::Soundex.soundex('Knuth')
|
29
|
+
# => 'K530'
|
30
30
|
|
31
|
-
|
32
|
-
Hampton Catlin (hcatlin)
|
31
|
+
=== Porter stemming
|
33
32
|
|
34
|
-
|
33
|
+
Text::PorterStemming.stem('abatements') # => 'abat'
|
34
|
+
|
35
|
+
=== White similarity
|
36
|
+
|
37
|
+
white = Text::WhiteSimilarity.new
|
38
|
+
white.similarity('Healed', 'Sealed') # => 0.8
|
39
|
+
white.similarity('Healed', 'Help') # => 0.25
|
40
|
+
|
41
|
+
Note that some intermediate information is cached on the instance to improve
|
42
|
+
performance.
|
43
|
+
|
44
|
+
== Ruby version compatibility
|
45
|
+
|
46
|
+
The library has been tested on Ruby 1.8.6 to 1.9.3 and on JRuby.
|
47
|
+
|
48
|
+
== Thanks
|
49
|
+
|
50
|
+
* Hampton Catlin (hcatlin) for Ruby 1.9 compatibility work
|
51
|
+
* Wilker Lúcio for the initial implementation of the White algorithm
|
52
|
+
|
53
|
+
== License
|
35
54
|
|
36
55
|
Same as Ruby.
|
data/Rakefile
CHANGED
@@ -1,48 +1,8 @@
|
|
1
1
|
require 'rake'
|
2
2
|
require 'rake/testtask'
|
3
|
-
require 'rake/packagetask'
|
4
|
-
require 'rake/gempackagetask'
|
5
|
-
require 'rcov/rcovtask'
|
6
|
-
require 'rake/rdoctask'
|
7
|
-
|
8
|
-
$:.unshift(File.dirname(__FILE__) + '/lib')
|
9
|
-
require 'text/version'
|
10
|
-
|
11
|
-
gemspec = Gem::Specification.new do |s|
|
12
|
-
s.name = 'text'
|
13
|
-
s.version = Text::VERSION::STRING
|
14
|
-
s.summary = 'A collection of text algorithms'
|
15
|
-
s.description = 'A collection of text algorithms: Levenshtein, Soundex, Metaphone, Double Metaphone, Figlet, Porter Stemming'
|
16
|
-
s.files = FileList['{lib,test}/**/*', 'README.rdoc', 'Rakefile']
|
17
|
-
s.require_path = 'lib'
|
18
|
-
s.has_rdoc = true
|
19
|
-
s.extra_rdoc_files = %w[README.rdoc]
|
20
|
-
s.rubyforge_project = 'text'
|
21
|
-
s.homepage = 'http://github.com/threedaymonk/text'
|
22
|
-
s.authors = ['Paul Battley', 'Michael Neumann', 'Tim Fletcher']
|
23
|
-
s.email = "pbattley@gmail.com"
|
24
|
-
end
|
25
|
-
|
26
|
-
Rake::GemPackageTask.new(gemspec) do |pkg|
|
27
|
-
pkg.need_tar_gz = true
|
28
|
-
end
|
29
|
-
|
30
|
-
Rake::PackageTask.new(gemspec.name, gemspec.version) do |pkg|
|
31
|
-
pkg.need_tar_gz = true
|
32
|
-
pkg.package_files.include gemspec.files
|
33
|
-
end
|
34
3
|
|
35
4
|
Rake::TestTask.new do |t|
|
36
5
|
t.verbose = false
|
37
6
|
end
|
38
7
|
|
39
|
-
Rcov::RcovTask.new do |t|
|
40
|
-
t.rcov_opts = []
|
41
|
-
end
|
42
|
-
|
43
|
-
Rake::RDocTask.new do |t|
|
44
|
-
t.main = 'README.rdoc'
|
45
|
-
t.rdoc_files.include 'README.rdoc', 'lib/**/*.rb'
|
46
|
-
end
|
47
|
-
|
48
8
|
task :default => :test
|
data/lib/text/version.rb
CHANGED
data/lib/text/white_similarity.rb
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# Original author: Wilker Lúcio <wilkerlucio@gmail.com>
|
3
|
+
|
4
|
+
require "set"
|
5
|
+
|
6
|
+
module Text
|
7
|
+
|
8
|
+
# Ruby implementation of the string similarity described by Simon White
|
9
|
+
# at: http://www.catalysoft.com/articles/StrikeAMatch.html
|
10
|
+
#
|
11
|
+
# 2 * |pairs(s1) INTERSECT pairs(s2)|
|
12
|
+
# similarity(s1, s2) = -----------------------------------
|
13
|
+
# |pairs(s1)| + |pairs(s2)|
|
14
|
+
#
|
15
|
+
# e.g.
|
16
|
+
# 2 * |{FR, NC}|
|
17
|
+
# similarity(FRANCE, FRENCH) = ---------------------------------------
|
18
|
+
# |{FR,RA,AN,NC,CE}| + |{FR,RE,EN,NC,CH}|
|
19
|
+
#
|
20
|
+
# = (2 * 2) / (5 + 5)
|
21
|
+
#
|
22
|
+
# = 0.4
|
23
|
+
#
|
24
|
+
# WhiteSimilarity.new.similarity("FRANCE", "FRENCH")
|
25
|
+
#
|
26
|
+
# Simon White's letter-pair string similarity.
#
# Each string is upcased, split on whitespace into words, and every word is
# broken into its adjacent two-character pairs ("FRANCE" => FR, RA, AN, NC,
# CE). Pairs never span a word boundary. The score is
#
#   2 * |pairs(s1) INTERSECT pairs(s2)| / (|pairs(s1)| + |pairs(s2)|)
#
# Pairs are stored in a Set, so a pair that occurs several times in one
# string is counted once.
#
# An instance remembers the pair set of every string it has been given, so
# reusing one instance avoids recomputing pairs for repeated inputs.
class WhiteSimilarity

  # One-off comparison of +str1+ and +str2+ using a throwaway instance
  # (nothing is cached between calls).
  def self.similarity(str1, str2)
    new.similarity(str1, str2)
  end

  def initialize
    # Cache: input string => Set of its upcased letter pairs.
    @word_letter_pairs = {}
  end

  # Returns the similarity of +str1+ and +str2+ as a Float: 0.0 when they
  # share no letter pair, 1.0 when their pair sets are identical.
  #
  # NOTE: when neither string contains a word of two or more characters both
  # pair sets are empty and the result is NaN (0.0 / 0).
  def similarity(str1, str2)
    pairs1 = word_letter_pairs(str1)
    pairs2 = word_letter_pairs(str2)

    intersection = pairs1.inject(0) { |acc, pair|
      pairs2.include?(pair) ? acc + 1 : acc
    }
    # Denominator of the formula: the summed sizes, not a set union.
    union = pairs1.length + pairs2.length

    (2.0 * intersection) / union
  end

private

  # Returns (and caches) the Set of adjacent letter pairs of +str+.
  def word_letter_pairs(str)
    @word_letter_pairs[str] ||= Set.new(
      str.upcase.split(/\s+/).map { |word|
        # Slice the upcased word itself. Slicing +str+ here would ignore the
        # upcase and take every word's pairs from the start of the whole
        # string.
        (0 ... (word.length - 1)).map { |i| word[i, 2] }
      }.flatten
    )
  end
end
|
57
|
+
end
|
data/test/test_white_similarity.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'preamble')
|
2
|
+
require "text/white_similarity"
|
3
|
+
|
4
|
+
# Exercises Text::WhiteSimilarity by comparing "Healed" with a fixed list of
# words, once through the class-level shortcut and once through a single
# reused instance (which caches letter pairs between calls).
class WhiteSimilarityTest < Test::Unit::TestCase

  # Class-level call: a fresh instance is used for every comparison.
  def test_similarity
    base = "Healed"
    expectations = [
      [0.8,  "Sealed"],
      [0.55, "Healthy"],
      [0.44, "Heard"],
      [0.40, "Herded"],
      [0.25, "Help"],
      [0.0,  "Sold"]
    ]

    expectations.each do |score, other|
      assert_in_delta score, Text::WhiteSimilarity.similarity(base, other), 0.01
    end
  end

  # Instance call: the same object handles every comparison, so the pairs of
  # "Healed" come from its cache after the first assertion.
  def test_similarity_with_caching
    base = "Healed"
    scorer = Text::WhiteSimilarity.new
    expectations = [
      [0.8,  "Sealed"],
      [0.55, "Healthy"],
      [0.44, "Heard"],
      [0.40, "Herded"],
      [0.25, "Help"],
      [0.0,  "Sold"]
    ]

    expectations.each do |score, other|
      assert_in_delta score, scorer.similarity(base, other), 0.01
    end
  end
end
|
metadata
CHANGED
@@ -1,81 +1,75 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: text
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
5
6
|
platform: ruby
|
6
|
-
authors:
|
7
|
+
authors:
|
7
8
|
- Paul Battley
|
8
9
|
- Michael Neumann
|
9
10
|
- Tim Fletcher
|
10
11
|
autorequire:
|
11
12
|
bindir: bin
|
12
13
|
cert_chain: []
|
13
|
-
|
14
|
-
date: 2010-03-03 00:00:00 +00:00
|
15
|
-
default_executable:
|
14
|
+
date: 2011-11-21 00:00:00.000000000Z
|
16
15
|
dependencies: []
|
17
|
-
|
18
|
-
|
16
|
+
description: ! 'A collection of text algorithms: Levenshtein, Soundex, Metaphone,
|
17
|
+
Double Metaphone, Figlet, Porter Stemming'
|
19
18
|
email: pbattley@gmail.com
|
20
19
|
executables: []
|
21
|
-
|
22
20
|
extensions: []
|
23
|
-
|
24
|
-
extra_rdoc_files:
|
21
|
+
extra_rdoc_files:
|
25
22
|
- README.rdoc
|
26
|
-
files:
|
27
|
-
- lib/text/double_metaphone.rb
|
23
|
+
files:
|
28
24
|
- lib/text/levenshtein.rb
|
29
25
|
- lib/text/metaphone.rb
|
30
|
-
- lib/text/porter_stemming.rb
|
31
26
|
- lib/text/soundex.rb
|
27
|
+
- lib/text/white_similarity.rb
|
28
|
+
- lib/text/double_metaphone.rb
|
29
|
+
- lib/text/porter_stemming.rb
|
32
30
|
- lib/text/util.rb
|
33
31
|
- lib/text/version.rb
|
34
32
|
- lib/text.rb
|
35
|
-
- test/
|
36
|
-
- test/data/big.txt
|
33
|
+
- test/test_porter_stemming.rb
|
37
34
|
- test/data/chunky.flf
|
38
|
-
- test/data/chunky.txt
|
39
|
-
- test/data/double_metaphone.csv
|
40
|
-
- test/data/metaphone.txt
|
41
|
-
- test/data/metaphone_buggy.txt
|
42
35
|
- test/data/porter_stemming_input.txt
|
36
|
+
- test/data/metaphone.txt
|
37
|
+
- test/data/double_metaphone.csv
|
38
|
+
- test/data/big.flf
|
43
39
|
- test/data/porter_stemming_output.txt
|
40
|
+
- test/data/metaphone_buggy.txt
|
41
|
+
- test/data/chunky.txt
|
42
|
+
- test/data/big.txt
|
44
43
|
- test/preamble.rb
|
45
|
-
- test/test_double_metaphone.rb
|
46
|
-
- test/test_levenshtein.rb
|
47
|
-
- test/test_metaphone.rb
|
48
|
-
- test/test_porter_stemming.rb
|
49
44
|
- test/test_soundex.rb
|
45
|
+
- test/test_white_similarity.rb
|
46
|
+
- test/test_metaphone.rb
|
47
|
+
- test/test_levenshtein.rb
|
48
|
+
- test/test_double_metaphone.rb
|
50
49
|
- README.rdoc
|
51
50
|
- Rakefile
|
52
|
-
has_rdoc: true
|
53
51
|
homepage: http://github.com/threedaymonk/text
|
54
52
|
licenses: []
|
55
|
-
|
56
53
|
post_install_message:
|
57
54
|
rdoc_options: []
|
58
|
-
|
59
|
-
require_paths:
|
55
|
+
require_paths:
|
60
56
|
- lib
|
61
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
57
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
58
|
+
none: false
|
59
|
+
requirements:
|
60
|
+
- - ! '>='
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: '0'
|
63
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
64
|
+
none: false
|
65
|
+
requirements:
|
66
|
+
- - ! '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
73
69
|
requirements: []
|
74
|
-
|
75
70
|
rubyforge_project: text
|
76
|
-
rubygems_version: 1.
|
71
|
+
rubygems_version: 1.8.11
|
77
72
|
signing_key:
|
78
73
|
specification_version: 3
|
79
74
|
summary: A collection of text algorithms
|
80
75
|
test_files: []
|
81
|
-
|