imatch 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.autotest +23 -0
- data/History.txt +6 -0
- data/Manifest.txt +9 -0
- data/README.txt +59 -0
- data/Rakefile +12 -0
- data/lib/data/en.dat +68342 -0
- data/lib/imatch.rb +85 -0
- data/lib/lexicon.rb +48 -0
- data/test/test_imatch.rb +81 -0
- data/test/test_lexicon.rb +76 -0
- metadata +128 -0
data/lib/imatch.rb
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
|
2
|
+
require 'set'
|
3
|
+
require 'digest/sha1'
|
4
|
+
|
5
|
+
# gem install stemmer (Porter stemmer implementation)
|
6
|
+
require 'stemmer'
|
7
|
+
|
8
|
+
require 'lexicon'
|
9
|
+
|
10
|
+
# Computes I-Match style signatures for text: the SHA1 hex digest of the
# sorted, de-duplicated tokens of a string that are present in a lexicon.
# Optionally produces extra signatures against randomized sub-lexicons.
class IMatch
  VERSION = '0.1.0'
  # Lexicon file used when the caller does not supply one.
  DEFAULT_LEXICON_FILE = File.join(File.dirname(__FILE__), 'data', 'en.dat')
  # Number of randomized sub-lexicons built when :lexicons is not given.
  DEFAULT_NUMBER_OF_LEXICONS = 0
  # Value handed to Lexicon#subset when :lexicon_fraction is not given.
  DEFAULT_LEXICON_FRACTION = 0.66

  # The frozen IMatch::Lexicon built from the constructor argument.
  attr_reader :lexicon

  # @param file passed straight to IMatch::Lexicon.new
  # @param options [Hash] recognised keys:
  #   :stop_words       - enumerable of tokens to ignore (default: none)
  #   :stemming         - truthy to call String#stem on each token
  #   :lexicons         - how many randomized sub-lexicons to build
  #   :lexicon_fraction - argument passed to Lexicon#subset for each of them
  def initialize(file = DEFAULT_LEXICON_FILE, options = {})
    @lexicon = IMatch::Lexicon.new(file).freeze
    @stop_words = (options[:stop_words] || []).to_set

    @should_stem = !!options[:stemming]

    @number_of_lexicons = (options[:lexicons] || DEFAULT_NUMBER_OF_LEXICONS).to_i
    @lexicon_fraction = (options[:lexicon_fraction] || DEFAULT_LEXICON_FRACTION).to_f
    # A zero or negative count yields no sub-lexicons.
    @subsets = @number_of_lexicons.times.map { @lexicon.subset(@lexicon_fraction) }
  end

  # Signatures of +string+ against the full lexicon and every sub-lexicon.
  #
  # @param string [String, nil] text to fingerprint
  # @param tokenize [Regexp, String] separator given to String#split
  # @return [Set<String>] distinct signatures; empty when none could be made
  def multiple_signatures(string, tokenize = /\s+/)
    # Splitting, downcasing, stemming and stop-word removal do not depend on
    # the lexicon, so do them once rather than once per sub-lexicon.
    candidates = candidate_tokens(string, tokenize)

    [@lexicon, *@subsets].each_with_object(Set.new) do |lex, signatures|
      sig = signature_for(candidates, lex)
      signatures << sig if sig
    end
  end

  # Signature of +string+ against one lexicon.
  #
  # @param string [String, nil] text to fingerprint
  # @param tokenize [Regexp, String] separator given to String#split
  # @param lexicon anything responding to include?; defaults to the full lexicon
  # @return [String, nil] SHA1 hex digest, or nil when +string+ is nil or no
  #   token survives stop-word and lexicon filtering
  def signature(string, tokenize = /\s+/, lexicon = nil)
    signature_for(candidate_tokens(string, tokenize), lexicon || @lexicon)
  end

  def to_s
    %Q{<IMatch stemming="#{@should_stem}" stop_word_count="#{@stop_words.size}">#{@lexicon.to_s}</IMatch>}
  end

  private

  # Unique normalized tokens of +string+ with stop words removed.
  def candidate_tokens(string, tokenize)
    return Set.new unless string

    string.split(tokenize).each_with_object(Set.new) do |raw, tokens|
      token = raw.downcase
      # String#stem is only present when a stemmer library has been loaded.
      token = token.stem if @should_stem && token.respond_to?(:stem)
      tokens << token unless @stop_words.include?(token)
    end
  end

  # Digest of the tokens found in +lexicon+, or nil when there are none.
  def signature_for(tokens, lexicon)
    usable_tokens = tokens.select { |token| lexicon.include?(token) }
    return nil if usable_tokens.empty?

    finger_print(usable_tokens.sort)
  end

  # SHA1 hex digest of the tokens fed in order, with no separator between
  # them. Adding a separator would change every existing signature.
  def finger_print(tokens)
    digest = Digest::SHA1.new
    tokens.each { |t| digest.update(t) }
    digest.to_s
  end
end
|
data/lib/lexicon.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
|
2
|
+
class IMatch
  # An immutable set of known terms, loaded from a newline-separated file or
  # copied from an existing Set.
  #
  # NOTE(review): InvalidLexiconError is not defined in this file. It has to be
  # defined before any of the raises below fire, otherwise Ruby reports a
  # NameError instead of the intended error.
  class Lexicon

    # @param file_or_set [Set, File, String] a Set of terms, an open File, or
    #   a path to a file with one term per line
    # @raise [InvalidLexiconError] when the argument is of another type, the
    #   path does not exist, or the resulting term set is empty
    def initialize(file_or_set)
      case file_or_set
      when Set
        @file = "N/A"
        # Copy so later changes to the caller's set cannot leak in.
        @data = file_or_set.clone.freeze
      when File
        @file = File.expand_path(file_or_set.path)
        @data = read_terms(@file)
      when String
        raise(InvalidLexiconError, "Invalid/missing lexicon file: #{file_or_set}") unless File.exist?(file_or_set)
        @file = File.expand_path(file_or_set)
        @data = read_terms(@file)
      else
        raise(InvalidLexiconError, "Invalid/missing lexicon argument: #{file_or_set}")
      end

      raise(InvalidLexiconError, "Empty lexicon file: #{file_or_set}") if @data.empty?
    end

    # @return [Boolean] whether +key+ is a term of this lexicon
    def include?(key)
      @data.include?(key)
    end

    # @return [Integer] number of distinct terms
    def size
      @data.size
    end

    def to_s
      %Q{<IMatch::Lexicon size="#{size}" file="#{@file}" />}
    end

    # Builds a new lexicon from a random sample of this one's terms.
    #
    # Each term is kept independently with probability +percentage+, so the
    # result holds roughly that fraction of the terms. The comparison used to
    # be `rand > percentage`, which kept about 1 - percentage of them.
    #
    # @param percentage [Float] fraction to keep, between 0.0 and 1.0
    # @return [IMatch::Lexicon]
    # @raise [InvalidLexiconError] when no term is selected, because the
    #   constructor rejects an empty term set
    def subset(percentage)
      kept = @data.select { |_term| rand < percentage }.to_set

      self.class.new(kept)
    end

    private

    # Reads +path+ and returns its lines (LF or CRLF) as a frozen Set.
    def read_terms(path)
      IO.read(path).split(/\r?\n/).to_set.freeze
    end

  end
end
|
data/test/test_imatch.rb
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
require "test/unit"
|
2
|
+
require "imatch"
|
3
|
+
|
4
|
+
# Error class named by the lexicon code when a lexicon is missing, empty, or
# of an unsupported type. It derives from StandardError rather than Exception
# so that a plain `rescue` can handle it.
class InvalidLexiconError < StandardError; end
|
5
|
+
|
6
|
+
# Unit tests for IMatch signature generation against the default lexicon.
class TestIMatch < Test::Unit::TestCase

  def test_defines_imatch_class
    assert IMatch
    assert_kind_of Class, IMatch
  end

  def test_initalize_with_no_args_loads_the_default_lexicon
    matcher = IMatch.new
    assert matcher
    assert matcher.lexicon, "expected a lexicon"
    assert matcher.lexicon.size > 0, "Didn't expect a blank lexicon"
  end

  def test_nil_input_creates_nil_output
    matcher = IMatch.new
    assert_nil matcher.signature(nil)
  end

  def test_known_imatch_score
    digest = IMatch.new.signature('foo bar')
    assert_kind_of String, digest
    assert_equal '60518c1c11dc0452be71a7118a43ab68e3451b82', digest
  end

  # Two independently built instances must agree on the same input.
  def test_imatch_consistent
    first = IMatch.new.signature('foo bar')
    second = IMatch.new.signature('foo bar')
    assert_equal first, second
  end

  # Token order must not influence the signature.
  def test_imatch_unordered
    forward = IMatch.new.signature('foo bar')
    reversed = IMatch.new.signature('bar foo')
    assert_equal forward, reversed
  end

  def test_imatch_simple_plurals_if_stemming_enabled
    with_stemming = IMatch.new(IMatch::DEFAULT_LEXICON_FILE, :stemming => true)
    without_stemming = IMatch.new

    assert_equal with_stemming.signature('follower'),
                 with_stemming.signature('followers'),
                 "Failed to stem when enabled"
    assert_not_equal without_stemming.signature('follower'),
                     without_stemming.signature('followers')
  end

  def test_stop_words_skipped
    matcher = IMatch.new(IMatch::DEFAULT_LEXICON_FILE, :stop_words => ['a'])
    assert_nil matcher.signature("a")
    assert_equal matcher.signature("foo"), matcher.signature("a foo")
  end

  # Terms missing from the lexicon contribute nothing to the signature.
  def test_skipping_unknown_terms
    matcher = IMatch.new
    unknown = '{{example}}'
    assert_equal false, matcher.lexicon.include?(unknown)
    assert_nil matcher.signature(unknown)
    assert_equal matcher.signature("string"), matcher.signature("{{example}} string")
  end

  def test_alternate_splitting
    by_whitespace = IMatch.new.signature('F 16')
    by_non_word = IMatch.new.signature('F-16', /\W+/)
    assert_equal by_whitespace, by_non_word
  end

  def test_to_s
    matcher = IMatch.new
    rendered = matcher.to_s
    ["stemming=\"false\"", "stop_word_count=\"0\"", matcher.lexicon.to_s].each do |fragment|
      assert rendered.include?(fragment)
    end
  end

  def test_multiple_lexicon_signatures
    text = "this is a test"
    matcher = IMatch.new(IMatch::DEFAULT_LEXICON_FILE, :lexicons => 5)

    full_lexicon_signature = matcher.signature(text)
    all_signatures = matcher.multiple_signatures(text)

    assert_kind_of Set, all_signatures
    assert_equal false, all_signatures.empty?
    assert all_signatures.include?(full_lexicon_signature)
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
require "test/unit"
|
2
|
+
require "imatch"
|
3
|
+
|
4
|
+
# Unit tests for IMatch::Lexicon construction, lookup and random subsetting.
class TestIMatchLexicon < Test::Unit::TestCase

  def test_defines_lexicon_class
    assert IMatch::Lexicon
    assert_kind_of Class, IMatch::Lexicon
  end

  def test_nil_file_raises_error
    assert_raise(InvalidLexiconError) { IMatch::Lexicon.new(nil) }
  end

  def test_missing_file_raises_error
    assert_raise(InvalidLexiconError) { IMatch::Lexicon.new(fixture_path('not_such_file')) }
  end

  def test_empty_file_raises_error
    assert_raise(InvalidLexiconError) { IMatch::Lexicon.new(fixture_path('empty.dat')) }
  end

  def test_lexicon_size
    ten = IMatch::Lexicon.new(fixture_path('ten.dat'))
    assert_equal 10, ten.size
  end

  def test_lexicon_duplicates_ignored
    with_duplicates = IMatch::Lexicon.new(fixture_path('duplicates.dat'))
    assert_equal 7, with_duplicates.size
  end

  def test_lexicon_include
    ten = IMatch::Lexicon.new(fixture_path('ten.dat'))
    expected_terms = %w(this file has ten terms in the lexicon for testing)
    expected_terms.each do |term|
      assert ten.include?(term), "Lexicon did not include test term: #{term}"
    end
  end

  def test_to_s
    filename = File.expand_path(fixture_path('ten.dat'))
    rendered = IMatch::Lexicon.new(fixture_path('ten.dat'))
    assert_match(/#{filename}/, rendered.to_s)
    assert_match(/#{rendered.size}/, rendered.to_s)
  end

  def test_new_with_file_argument
    handle = File.new(fixture_path('ten.dat'))
    from_handle = IMatch::Lexicon.new(handle)
    assert_equal 10, from_handle.size
  end

  def test_new_with_set_argument
    from_set = IMatch::Lexicon.new(%w(a b c d).to_set)
    assert_equal 4, from_set.size
    assert_match(/N\/A/, from_set.to_s)
  end

  # A 0.5 fraction is symmetric, so this checks that sampling is random but
  # not which side of the threshold is kept.
  def test_random_subset
    whole = IMatch::Lexicon.new(IMatch::DEFAULT_LEXICON_FILE)
    assert whole.size > 10000, "Default lexicon is too small for this test"

    half = whole.subset(0.5)
    portion = (half.size.to_f / whole.size.to_f).to_f

    assert portion > 0.4, "A 50% subset should be >40% of the size or else random is not working (#{portion})"
    assert portion < 0.6, "A 50% subset should be <60% of the size or else random is not working (#{portion})"
  end

  private

  # Path of a fixture file in the lexicons directory next to this test file.
  def fixture_path(name)
    File.join(File.dirname(__FILE__), 'lexicons', name)
  end

end
|
metadata
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: imatch
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Matt Sanford
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-09-01 00:00:00 -07:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: stemmer
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 21
|
30
|
+
segments:
|
31
|
+
- 1
|
32
|
+
- 0
|
33
|
+
- 1
|
34
|
+
version: 1.0.1
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: *id001
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: rubyforge
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ">="
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
hash: 7
|
46
|
+
segments:
|
47
|
+
- 2
|
48
|
+
- 0
|
49
|
+
- 4
|
50
|
+
version: 2.0.4
|
51
|
+
type: :development
|
52
|
+
version_requirements: *id002
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
name: hoe
|
55
|
+
prerelease: false
|
56
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
hash: 21
|
62
|
+
segments:
|
63
|
+
- 2
|
64
|
+
- 6
|
65
|
+
- 1
|
66
|
+
version: 2.6.1
|
67
|
+
type: :development
|
68
|
+
version_requirements: *id003
|
69
|
+
description: An implementation of the IMatch algorithm as described at http://www.ir.iit.edu/~abdur/Research/Duplicate.html
|
70
|
+
email:
|
71
|
+
- matt@twitter.com
|
72
|
+
executables: []
|
73
|
+
|
74
|
+
extensions: []
|
75
|
+
|
76
|
+
extra_rdoc_files:
|
77
|
+
- History.txt
|
78
|
+
- Manifest.txt
|
79
|
+
- README.txt
|
80
|
+
files:
|
81
|
+
- .autotest
|
82
|
+
- History.txt
|
83
|
+
- Manifest.txt
|
84
|
+
- README.txt
|
85
|
+
- Rakefile
|
86
|
+
- lib/imatch.rb
|
87
|
+
- lib/lexicon.rb
|
88
|
+
- lib/data/en.dat
|
89
|
+
- test/test_imatch.rb
|
90
|
+
- test/test_lexicon.rb
|
91
|
+
has_rdoc: true
|
92
|
+
homepage: http://twitter.com/mzsanford
|
93
|
+
licenses: []
|
94
|
+
|
95
|
+
post_install_message:
|
96
|
+
rdoc_options:
|
97
|
+
- --main
|
98
|
+
- README.txt
|
99
|
+
require_paths:
|
100
|
+
- lib
|
101
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
102
|
+
none: false
|
103
|
+
requirements:
|
104
|
+
- - ">="
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
hash: 3
|
107
|
+
segments:
|
108
|
+
- 0
|
109
|
+
version: "0"
|
110
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
111
|
+
none: false
|
112
|
+
requirements:
|
113
|
+
- - ">="
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
hash: 3
|
116
|
+
segments:
|
117
|
+
- 0
|
118
|
+
version: "0"
|
119
|
+
requirements: []
|
120
|
+
|
121
|
+
rubyforge_project: imatch
|
122
|
+
rubygems_version: 1.3.7
|
123
|
+
signing_key:
|
124
|
+
specification_version: 3
|
125
|
+
summary: An implementation of the IMatch algorithm as described at http://www.ir.iit.edu/~abdur/Research/Duplicate.html
|
126
|
+
test_files:
|
127
|
+
- test/test_imatch.rb
|
128
|
+
- test/test_lexicon.rb
|