text 0.2.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +38 -19
- data/Rakefile +0 -40
- data/lib/text/version.rb +2 -2
- data/lib/text/white_similarity.rb +57 -0
- data/test/test_white_similarity.rb +29 -0
- metadata +38 -44
data/README.rdoc
CHANGED
@@ -2,35 +2,54 @@
|
|
2
2
|
|
3
3
|
A collection of text algorithms.
|
4
4
|
|
5
|
-
|
5
|
+
== Usage
|
6
6
|
|
7
7
|
require 'text'
|
8
8
|
|
9
|
-
|
10
|
-
figlet = Text::Figlet::Typesetter.new(font)
|
11
|
-
figlet['Hello World'] # => '...'
|
9
|
+
=== Levenshtein distance
|
12
10
|
|
13
|
-
Text::Levenshtein.distance('test', 'test')
|
14
|
-
|
11
|
+
Text::Levenshtein.distance('test', 'test')
|
12
|
+
# => 0
|
13
|
+
Text::Levenshtein.distance('test', 'tent')
|
14
|
+
# => 1
|
15
15
|
|
16
|
-
|
17
|
-
Text::Metaphone.double_metaphone('Coburn') # => ['KPRN', nil]
|
18
|
-
Text::Metaphone.double_metaphone('Angier') # => ['ANJ', 'ANJR']
|
16
|
+
=== Metaphone
|
19
17
|
|
20
|
-
Text::
|
18
|
+
Text::Metaphone.metaphone('BRIAN')
|
19
|
+
# => 'BRN'
|
21
20
|
|
22
|
-
Text::
|
21
|
+
Text::Metaphone.double_metaphone('Coburn')
|
22
|
+
# => ['KPRN', nil]
|
23
|
+
Text::Metaphone.double_metaphone('Angier')
|
24
|
+
# => ['ANJ', 'ANJR']
|
23
25
|
|
24
|
-
|
26
|
+
=== Soundex
|
25
27
|
|
26
|
-
|
27
|
-
|
28
|
-
allow you to quickly do text-rendering in ASCII.
|
29
|
-
On 1.9, Figlet isn't loaded and isn't tested.
|
28
|
+
Text::Soundex.soundex('Knuth')
|
29
|
+
# => 'K530'
|
30
30
|
|
31
|
-
|
32
|
-
Hampton Catlin (hcatlin)
|
31
|
+
=== Porter stemming
|
33
32
|
|
34
|
-
|
33
|
+
Text::PorterStemming.stem('abatements') # => 'abat'
|
34
|
+
|
35
|
+
=== White similarity
|
36
|
+
|
37
|
+
white = Text::WhiteSimilarity.new
|
38
|
+
white.similarity('Healed', 'Sealed') # 0.8
|
39
|
+
white.similarity('Healed', 'Help') # 0.25
|
40
|
+
|
41
|
+
Note that some intermediate information is cached on the instance to improve
|
42
|
+
performance.
|
43
|
+
|
44
|
+
== Ruby version compatibility
|
45
|
+
|
46
|
+
The library has been tested on Ruby 1.8.6 to 1.9.3 and on JRuby.
|
47
|
+
|
48
|
+
== Thanks
|
49
|
+
|
50
|
+
* Hampton Catlin (hcatlin) for Ruby 1.9 compatibility work
|
51
|
+
* Wilker Lúcio for the initial implementation of the White algorithm
|
52
|
+
|
53
|
+
== License
|
35
54
|
|
36
55
|
Same as Ruby.
|
data/Rakefile
CHANGED
@@ -1,48 +1,8 @@
|
|
1
1
|
require 'rake'
|
2
2
|
require 'rake/testtask'
|
3
|
-
require 'rake/packagetask'
|
4
|
-
require 'rake/gempackagetask'
|
5
|
-
require 'rcov/rcovtask'
|
6
|
-
require 'rake/rdoctask'
|
7
|
-
|
8
|
-
$:.unshift(File.dirname(__FILE__) + '/lib')
|
9
|
-
require 'text/version'
|
10
|
-
|
11
|
-
gemspec = Gem::Specification.new do |s|
|
12
|
-
s.name = 'text'
|
13
|
-
s.version = Text::VERSION::STRING
|
14
|
-
s.summary = 'A collection of text algorithms'
|
15
|
-
s.description = 'A collection of text algorithms: Levenshtein, Soundex, Metaphone, Double Metaphone, Figlet, Porter Stemming'
|
16
|
-
s.files = FileList['{lib,test}/**/*', 'README.rdoc', 'Rakefile']
|
17
|
-
s.require_path = 'lib'
|
18
|
-
s.has_rdoc = true
|
19
|
-
s.extra_rdoc_files = %w[README.rdoc]
|
20
|
-
s.rubyforge_project = 'text'
|
21
|
-
s.homepage = 'http://github.com/threedaymonk/text'
|
22
|
-
s.authors = ['Paul Battley', 'Michael Neumann', 'Tim Fletcher']
|
23
|
-
s.email = "pbattley@gmail.com"
|
24
|
-
end
|
25
|
-
|
26
|
-
Rake::GemPackageTask.new(gemspec) do |pkg|
|
27
|
-
pkg.need_tar_gz = true
|
28
|
-
end
|
29
|
-
|
30
|
-
Rake::PackageTask.new(gemspec.name, gemspec.version) do |pkg|
|
31
|
-
pkg.need_tar_gz = true
|
32
|
-
pkg.package_files.include gemspec.files
|
33
|
-
end
|
34
3
|
|
35
4
|
Rake::TestTask.new do |t|
|
36
5
|
t.verbose = false
|
37
6
|
end
|
38
7
|
|
39
|
-
Rcov::RcovTask.new do |t|
|
40
|
-
t.rcov_opts = []
|
41
|
-
end
|
42
|
-
|
43
|
-
Rake::RDocTask.new do |t|
|
44
|
-
t.main = 'README.rdoc'
|
45
|
-
t.rdoc_files.include 'README.rdoc', 'lib/**/*.rb'
|
46
|
-
end
|
47
|
-
|
48
8
|
task :default => :test
|
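data/lib/text/version.rb
CHANGED
(the +2 -2 hunk for this file is missing from this extract)
data/lib/text/white_similarity.rb
ADDED
@@ -0,0 +1,57 @@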
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# Original author: Wilker Lúcio <wilkerlucio@gmail.com>
|
3
|
+
|
4
|
+
require "set"
|
5
|
+
|
6
|
+
module Text
|
7
|
+
|
8
|
+
# Ruby implementation of the string similarity described by Simon White
|
9
|
+
# at: http://www.catalysoft.com/articles/StrikeAMatch.html
|
10
|
+
#
|
11
|
+
# 2 * |pairs(s1) INTERSECT pairs(s2)|
|
12
|
+
# similarity(s1, s2) = -----------------------------------
|
13
|
+
# |pairs(s1)| + |pairs(s2)|
|
14
|
+
#
|
15
|
+
# e.g.
|
16
|
+
# 2 * |{FR, NC}|
|
17
|
+
# similarity(FRANCE, FRENCH) = ---------------------------------------
|
18
|
+
# |{FR,RA,AN,NC,CE}| + |{FR,RE,EN,NC,CH}|
|
19
|
+
#
|
20
|
+
# = (2 * 2) / (5 + 5)
|
21
|
+
#
|
22
|
+
# = 0.4
|
23
|
+
#
|
24
|
+
# WhiteSimilarity.new.similarity("FRANCE", "FRENCH")
|
25
|
+
#
|
26
|
+
class WhiteSimilarity
|
27
|
+
|
28
|
+
def self.similarity(str1, str2)
|
29
|
+
new.similarity(str1, str2)
|
30
|
+
end
|
31
|
+
|
32
|
+
def initialize
|
33
|
+
@word_letter_pairs = {}
|
34
|
+
end
|
35
|
+
|
36
|
+
def similarity(str1, str2)
|
37
|
+
pairs1 = word_letter_pairs(str1)
|
38
|
+
pairs2 = word_letter_pairs(str2)
|
39
|
+
|
40
|
+
intersection = pairs1.inject(0) { |acc, pair|
|
41
|
+
pairs2.include?(pair) ? acc + 1 : acc
|
42
|
+
}
|
43
|
+
union = pairs1.length + pairs2.length
|
44
|
+
|
45
|
+
(2.0 * intersection) / union
|
46
|
+
end
|
47
|
+
|
48
|
+
private
|
49
|
+
def word_letter_pairs(str)
|
50
|
+
@word_letter_pairs[str] ||= Set.new(
|
51
|
+
str.upcase.split(/\s+/).map{ |word|
|
52
|
+
(0 ... (word.length - 1)).map { |i| str[i, 2] }
|
53
|
+
}.flatten
|
54
|
+
)
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
data/test/test_white_similarity.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'preamble')
|
2
|
+
require "text/white_similarity"
|
3
|
+
|
4
|
+
class WhiteSimilarityTest < Test::Unit::TestCase
|
5
|
+
|
6
|
+
def test_similarity
|
7
|
+
word = "Healed"
|
8
|
+
|
9
|
+
assert_in_delta 0.8, Text::WhiteSimilarity.similarity(word, "Sealed"), 0.01
|
10
|
+
assert_in_delta 0.55, Text::WhiteSimilarity.similarity(word, "Healthy"), 0.01
|
11
|
+
assert_in_delta 0.44, Text::WhiteSimilarity.similarity(word, "Heard"), 0.01
|
12
|
+
assert_in_delta 0.40, Text::WhiteSimilarity.similarity(word, "Herded"), 0.01
|
13
|
+
assert_in_delta 0.25, Text::WhiteSimilarity.similarity(word, "Help"), 0.01
|
14
|
+
assert_in_delta 0.0, Text::WhiteSimilarity.similarity(word, "Sold"), 0.01
|
15
|
+
end
|
16
|
+
|
17
|
+
def test_similarity_with_caching
|
18
|
+
word = "Healed"
|
19
|
+
|
20
|
+
white = Text::WhiteSimilarity.new
|
21
|
+
|
22
|
+
assert_in_delta 0.8, white.similarity(word, "Sealed"), 0.01
|
23
|
+
assert_in_delta 0.55, white.similarity(word, "Healthy"), 0.01
|
24
|
+
assert_in_delta 0.44, white.similarity(word, "Heard"), 0.01
|
25
|
+
assert_in_delta 0.40, white.similarity(word, "Herded"), 0.01
|
26
|
+
assert_in_delta 0.25, white.similarity(word, "Help"), 0.01
|
27
|
+
assert_in_delta 0.0, white.similarity(word, "Sold"), 0.01
|
28
|
+
end
|
29
|
+
end
|
metadata
CHANGED
@@ -1,81 +1,75 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: text
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
5
6
|
platform: ruby
|
6
|
-
authors:
|
7
|
+
authors:
|
7
8
|
- Paul Battley
|
8
9
|
- Michael Neumann
|
9
10
|
- Tim Fletcher
|
10
11
|
autorequire:
|
11
12
|
bindir: bin
|
12
13
|
cert_chain: []
|
13
|
-
|
14
|
-
date: 2010-03-03 00:00:00 +00:00
|
15
|
-
default_executable:
|
14
|
+
date: 2011-11-21 00:00:00.000000000Z
|
16
15
|
dependencies: []
|
17
|
-
|
18
|
-
|
16
|
+
description: ! 'A collection of text algorithms: Levenshtein, Soundex, Metaphone,
|
17
|
+
Double Metaphone, Figlet, Porter Stemming'
|
19
18
|
email: pbattley@gmail.com
|
20
19
|
executables: []
|
21
|
-
|
22
20
|
extensions: []
|
23
|
-
|
24
|
-
extra_rdoc_files:
|
21
|
+
extra_rdoc_files:
|
25
22
|
- README.rdoc
|
26
|
-
files:
|
27
|
-
- lib/text/double_metaphone.rb
|
23
|
+
files:
|
28
24
|
- lib/text/levenshtein.rb
|
29
25
|
- lib/text/metaphone.rb
|
30
|
-
- lib/text/porter_stemming.rb
|
31
26
|
- lib/text/soundex.rb
|
27
|
+
- lib/text/white_similarity.rb
|
28
|
+
- lib/text/double_metaphone.rb
|
29
|
+
- lib/text/porter_stemming.rb
|
32
30
|
- lib/text/util.rb
|
33
31
|
- lib/text/version.rb
|
34
32
|
- lib/text.rb
|
35
|
-
- test/
|
36
|
-
- test/data/big.txt
|
33
|
+
- test/test_porter_stemming.rb
|
37
34
|
- test/data/chunky.flf
|
38
|
-
- test/data/chunky.txt
|
39
|
-
- test/data/double_metaphone.csv
|
40
|
-
- test/data/metaphone.txt
|
41
|
-
- test/data/metaphone_buggy.txt
|
42
35
|
- test/data/porter_stemming_input.txt
|
36
|
+
- test/data/metaphone.txt
|
37
|
+
- test/data/double_metaphone.csv
|
38
|
+
- test/data/big.flf
|
43
39
|
- test/data/porter_stemming_output.txt
|
40
|
+
- test/data/metaphone_buggy.txt
|
41
|
+
- test/data/chunky.txt
|
42
|
+
- test/data/big.txt
|
44
43
|
- test/preamble.rb
|
45
|
-
- test/test_double_metaphone.rb
|
46
|
-
- test/test_levenshtein.rb
|
47
|
-
- test/test_metaphone.rb
|
48
|
-
- test/test_porter_stemming.rb
|
49
44
|
- test/test_soundex.rb
|
45
|
+
- test/test_white_similarity.rb
|
46
|
+
- test/test_metaphone.rb
|
47
|
+
- test/test_levenshtein.rb
|
48
|
+
- test/test_double_metaphone.rb
|
50
49
|
- README.rdoc
|
51
50
|
- Rakefile
|
52
|
-
has_rdoc: true
|
53
51
|
homepage: http://github.com/threedaymonk/text
|
54
52
|
licenses: []
|
55
|
-
|
56
53
|
post_install_message:
|
57
54
|
rdoc_options: []
|
58
|
-
|
59
|
-
require_paths:
|
55
|
+
require_paths:
|
60
56
|
- lib
|
61
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
57
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
58
|
+
none: false
|
59
|
+
requirements:
|
60
|
+
- - ! '>='
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: '0'
|
63
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
64
|
+
none: false
|
65
|
+
requirements:
|
66
|
+
- - ! '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
73
69
|
requirements: []
|
74
|
-
|
75
70
|
rubyforge_project: text
|
76
|
-
rubygems_version: 1.
|
71
|
+
rubygems_version: 1.8.11
|
77
72
|
signing_key:
|
78
73
|
specification_version: 3
|
79
74
|
summary: A collection of text algorithms
|
80
75
|
test_files: []
|
81
|
-
|