imatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.autotest +23 -0
- data/History.txt +6 -0
- data/Manifest.txt +9 -0
- data/README.txt +59 -0
- data/Rakefile +12 -0
- data/lib/data/en.dat +68342 -0
- data/lib/imatch.rb +85 -0
- data/lib/lexicon.rb +48 -0
- data/test/test_imatch.rb +81 -0
- data/test/test_lexicon.rb +76 -0
- metadata +128 -0
data/lib/imatch.rb
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
|
2
|
+
require 'set'
|
3
|
+
require 'digest/sha1'
|
4
|
+
|
5
|
+
# gem install stemmer (Porter stemmer implementation)
|
6
|
+
require 'stemmer'
|
7
|
+
|
8
|
+
require 'lexicon'
|
9
|
+
|
10
|
+
# Computes I-Match style signatures for strings: the tokens of a string that
# appear in a lexicon are de-duplicated, sorted and hashed with SHA1, so two
# strings with the same set of lexicon terms get the same signature
# regardless of word order, repetition or letter case.
class IMatch
  VERSION = '0.1.0'.freeze
  DEFAULT_LEXICON_FILE = File.join(File.dirname(__FILE__), 'data', 'en.dat').freeze
  DEFAULT_NUMBER_OF_LEXICONS = 0
  DEFAULT_LEXICON_FRACTION = 0.66

  # The full (frozen) IMatch::Lexicon built from the constructor argument.
  attr_reader :lexicon

  # file    - anything IMatch::Lexicon.new accepts; defaults to the bundled
  #           English lexicon file.
  # options - Hash of optional settings:
  #           :stop_words       - tokens to ignore (anything with #to_set;
  #                               default none).
  #           :stemming         - truthy to call #stem on each token.
  #           :lexicons         - number of random sub-lexicons to build for
  #                               #multiple_signatures (default 0).
  #           :lexicon_fraction - fraction passed to Lexicon#subset for each
  #                               sub-lexicon (default 0.66).
  def initialize(file = DEFAULT_LEXICON_FILE, options = {})
    @lexicon = IMatch::Lexicon.new(file).freeze
    @stop_words = (options[:stop_words] || []).to_set

    @should_stem = !!options[:stemming]

    @number_of_lexicons = (options[:lexicons] || DEFAULT_NUMBER_OF_LEXICONS).to_i
    @lexicon_fraction = (options[:lexicon_fraction] || DEFAULT_LEXICON_FRACTION).to_f

    # An empty range (count of zero or less) simply yields no subsets.
    @subsets = (1..@number_of_lexicons).map { @lexicon.subset(@lexicon_fraction) }
  end

  # Returns a Set holding the signature for the full lexicon plus one
  # signature per random sub-lexicon. Lexicons that match none of the
  # tokens contribute nothing, so the Set may be empty.
  def multiple_signatures(string, tokenize = /\s+/)
    signatures = Set.new

    # Splitting, downcasing and stemming do not depend on the lexicon, so
    # do that work once and only repeat the lexicon filter.
    candidates = candidate_tokens(string, tokenize)

    ([@lexicon] + @subsets).each do |lex|
      sig = signature_for(candidates, lex)
      signatures << sig if sig
    end

    signatures
  end

  # Returns the SHA1 hex signature of +string+, or nil when +string+ is nil
  # or none of its tokens survive stop-word and lexicon filtering.
  #
  # tokenize - pattern handed to String#split (default: runs of whitespace).
  # lexicon  - lexicon to filter against; defaults to the full lexicon.
  def signature(string, tokenize = /\s+/, lexicon = nil)
    signature_for(candidate_tokens(string, tokenize), lexicon || @lexicon)
  end

  def to_s
    %Q{<IMatch stemming="#{@should_stem}" stop_word_count="#{@stop_words.size}">#{@lexicon.to_s}</IMatch>}
  end

  private

  # Splits +string+ and returns its distinct normalized tokens (downcased,
  # optionally stemmed) with stop words removed. Returns [] for nil input.
  def candidate_tokens(string, tokenize)
    return [] unless string

    normalized = string.split(tokenize).map do |raw|
      token = raw.downcase
      # Stemming relies on String#stem from the stemmer gem being loaded.
      token = token.stem if @should_stem && token.respond_to?(:stem)
      token
    end

    # Stop words are compared against the normalized (stemmed) form.
    normalized.reject { |token| @stop_words.include?(token) }.uniq
  end

  # Hashes the candidates present in +lexicon+; nil when none are present.
  def signature_for(candidates, lexicon)
    usable = candidates.select { |token| lexicon.include?(token) }
    return nil if usable.empty?

    finger_print(usable.sort)
  end

  # SHA1 hex digest of the tokens concatenated in the given order.
  # NOTE: tokens are joined without a separator, so token lists with the
  # same concatenation (["ab", "c"] vs ["a", "bc"]) share a signature. This
  # is kept as-is because changing it would alter every existing signature.
  def finger_print(tokens)
    Digest::SHA1.hexdigest(tokens.join)
  end
end
|
data/lib/lexicon.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
|
2
|
+
# Set is used throughout this file; require it here so the file also loads
# on its own rather than depending on the caller having required it first.
require 'set'

# Raised by IMatch::Lexicon for a missing, unsupported or empty lexicon
# source. The library raised this constant without ever defining it, so any
# error path ended in NameError unless the caller had declared it.
#
# It inherits from Exception (not StandardError) because the bundled tests
# re-declare `class InvalidLexiconError < Exception`; a different superclass
# here would make that re-declaration fail with a superclass mismatch. As a
# consequence a bare `rescue` does not catch it - rescue it by name.
unless defined?(InvalidLexiconError)
  class InvalidLexiconError < Exception; end
end

class IMatch
  # A frozen set of known terms, loaded from a newline-separated file or
  # copied from an existing Set.
  class Lexicon

    # file_or_set - one of:
    #               Set    - copied and frozen; the file is reported as "N/A".
    #               File   - terms are read from the file's path.
    #               String - path to a lexicon file, one term per line.
    #
    # Raises InvalidLexiconError when the argument is of another type, the
    # String path does not exist, or the resulting term set is empty.
    def initialize(file_or_set)
      case file_or_set
      when Set
        @file = "N/A"
        @data = file_or_set.clone.freeze
      when File
        @file = File.expand_path(file_or_set.path)
        @data = read_terms(@file)
      when String
        unless File.exist?(file_or_set)
          raise(InvalidLexiconError, "Invalid/missing lexicon file: #{file_or_set}")
        end
        @file = File.expand_path(file_or_set)
        @data = read_terms(@file)
      else
        raise(InvalidLexiconError, "Invalid/missing lexicon argument: #{file_or_set}")
      end

      raise(InvalidLexiconError, "Empty lexicon file: #{file_or_set}") if @data.empty?
    end

    # True when +key+ is a term of this lexicon (exact match).
    def include?(key)
      @data.include?(key)
    end

    # Number of distinct terms.
    def size
      @data.size
    end

    def to_s
      %Q{<IMatch::Lexicon size="#{size}" file="#{@file}" />}
    end

    # Returns a new Lexicon holding a random sample of this one, keeping
    # each term independently with probability +percentage+.
    #
    # percentage should be between 0.0 and 1.0
    #
    # Because the constructor rejects empty sets, a draw that keeps no
    # terms at all raises InvalidLexiconError.
    def subset(percentage)
      kept = Set.new
      # rand is in [0, 1), so `rand < percentage` keeps a term with
      # probability `percentage` (the previous `>` kept 1 - percentage).
      @data.each { |term| kept << term if rand < percentage }

      self.class.new(kept)
    end

    private

    # Reads +path+ and returns its lines (LF or CRLF separated) as a frozen
    # Set; duplicate lines collapse into one term.
    def read_terms(path)
      IO.read(path).split(/\r?\n/).to_set.freeze
    end

  end
end
|
data/test/test_imatch.rb
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
require "test/unit"
|
2
|
+
require "imatch"
|
3
|
+
|
4
|
+
# Error class that IMatch::Lexicon raises for bad lexicon input. Declared at
# the top level, which is where the unqualified constant used inside
# IMatch::Lexicon resolves to. Note that it descends from Exception, so a
# bare `rescue` will not catch it.
class InvalidLexiconError < Exception; end;
|
5
|
+
|
6
|
+
# Behavioural tests for IMatch signatures, run against the bundled default
# lexicon.
class TestIMatch < Test::Unit::TestCase

  # The library exposes IMatch as a class.
  def test_defines_imatch_class
    assert IMatch
    assert IMatch.kind_of?(Class)
  end

  # With no arguments the bundled lexicon is loaded and is non-empty.
  def test_initalize_with_no_args_loads_the_default_lexicon
    matcher = IMatch.new
    assert matcher
    loaded = matcher.lexicon
    assert loaded, "expected a lexicon"
    assert loaded.size > 0, "Didn't expect a blank lexicon"
  end

  # nil in, nil out.
  def test_nil_input_creates_nil_output
    matcher = IMatch.new
    assert_nil matcher.signature(nil)
  end

  # Pins the exact signature for a known input.
  def test_known_imatch_score
    result = IMatch.new.signature('foo bar')
    assert result.kind_of?(String)
    assert_equal '60518c1c11dc0452be71a7118a43ab68e3451b82', result
  end

  # Two independent instances agree on the same input.
  def test_imatch_consistent
    first = IMatch.new.signature('foo bar')
    second = IMatch.new.signature('foo bar')
    assert_equal first, second
  end

  # Word order does not affect the signature.
  def test_imatch_unordered
    forward = IMatch.new.signature('foo bar')
    reversed = IMatch.new.signature('bar foo')
    assert_equal forward, reversed
  end

  # Singular and plural match only when stemming is switched on.
  def test_imatch_simple_plurals_if_stemming_enabled
    with_stemming = IMatch.new(IMatch::DEFAULT_LEXICON_FILE, :stemming => true)
    without_stemming = IMatch.new

    assert_equal with_stemming.signature('follower'),
                 with_stemming.signature('followers'),
                 "Failed to stem when enabled"
    assert_not_equal without_stemming.signature('follower'),
                     without_stemming.signature('followers')
  end

  # Stop words never contribute to a signature.
  def test_stop_words_skipped
    matcher = IMatch.new(IMatch::DEFAULT_LEXICON_FILE, :stop_words => ['a'])
    assert_nil matcher.signature("a")
    assert_equal matcher.signature("foo"), matcher.signature("a foo")
  end

  # Terms missing from the lexicon are ignored.
  def test_skipping_unknown_terms
    matcher = IMatch.new
    unknown = '{{example}}'
    assert !matcher.lexicon.include?(unknown)
    assert_nil matcher.signature(unknown)
    assert_equal matcher.signature("string"), matcher.signature("{{example}} string")
  end

  # A custom split pattern can make differently punctuated input match.
  def test_alternate_splitting
    spaced = IMatch.new.signature('F 16')
    hyphenated = IMatch.new.signature('F-16', /\W+/)
    assert_equal spaced, hyphenated
  end

  # to_s reports the settings and embeds the lexicon description.
  def test_to_s
    matcher = IMatch.new
    description = matcher.to_s
    assert description.include?("stemming=\"false\"")
    assert description.include?("stop_word_count=\"0\"")
    assert description.include?(matcher.lexicon.to_s)
  end

  # multiple_signatures returns a non-empty Set containing the default
  # signature.
  def test_multiple_lexicon_signatures
    text = "this is a test"
    matcher = IMatch.new(IMatch::DEFAULT_LEXICON_FILE, :lexicons => 5)

    baseline = matcher.signature(text)
    all_signatures = matcher.multiple_signatures(text)

    assert all_signatures.kind_of?(Set)
    assert !all_signatures.empty?
    assert all_signatures.include?(baseline)
  end
end
|
data/test/test_lexicon.rb
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
require "test/unit"
|
2
|
+
require "imatch"
|
3
|
+
|
4
|
+
# Tests for IMatch::Lexicon construction, lookup and random subsetting.
#
# InvalidLexiconError is referenced but not defined in this file; it must
# already be defined when these tests run (test_imatch.rb declares it at the
# top level).
class TestIMatchLexicon < Test::Unit::TestCase

  # The library exposes IMatch::Lexicon as a class.
  def test_defines_lexicon_class
    assert IMatch::Lexicon
    assert IMatch::Lexicon.kind_of?(Class)
  end

  def test_nil_file_raises_error
    assert_raise InvalidLexiconError do
      IMatch::Lexicon.new(nil)
    end
  end

  def test_missing_file_raises_error
    assert_raise InvalidLexiconError do
      IMatch::Lexicon.new(fixture('not_such_file'))
    end
  end

  def test_empty_file_raises_error
    assert_raise InvalidLexiconError do
      IMatch::Lexicon.new(fixture('empty.dat'))
    end
  end

  def test_lexicon_size
    lexicon = IMatch::Lexicon.new(fixture('ten.dat'))
    assert_equal 10, lexicon.size
  end

  # Repeated lines in the file count once.
  def test_lexicon_duplicates_ignored
    lexicon = IMatch::Lexicon.new(fixture('duplicates.dat'))
    assert_equal 7, lexicon.size
  end

  def test_lexicon_include
    lexicon = IMatch::Lexicon.new(fixture('ten.dat'))
    %w(this file has ten terms in the lexicon for testing).each do |term|
      assert lexicon.include?(term), "Lexicon did not include test term: #{term}"
    end
  end

  # to_s mentions both the expanded file name and the size.
  def test_to_s
    filename = File.expand_path(fixture('ten.dat'))
    lexicon = IMatch::Lexicon.new(fixture('ten.dat'))
    # Escape the path: it may contain characters that are regex
    # metacharacters (".", "+", "(", ...).
    assert_match(/#{Regexp.escape(filename)}/, lexicon.to_s)
    assert_match(/#{lexicon.size}/, lexicon.to_s)
  end

  def test_new_with_file_argument
    # Block form so the handle is closed once the assertion has run.
    File.open(fixture('ten.dat')) do |file|
      lexicon = IMatch::Lexicon.new(file)
      assert_equal 10, lexicon.size
    end
  end

  def test_new_with_set_argument
    lexicon = IMatch::Lexicon.new(%w(a b c d).to_set)
    assert_equal 4, lexicon.size
    assert_match(/N\/A/, lexicon.to_s)
  end

  # A 50% subset of a large lexicon should land near half its size.
  def test_random_subset
    lexicon = IMatch::Lexicon.new(IMatch::DEFAULT_LEXICON_FILE)
    assert lexicon.size > 10000, "Default lexicon is too small for this test"

    subset = lexicon.subset(0.5)
    portion = (subset.size.to_f / lexicon.size.to_f).to_f

    assert portion > 0.4, "A 50% subset should be >40% of the size or else random is not working (#{portion})"
    assert portion < 0.6, "A 50% subset should be <60% of the size or else random is not working (#{portion})"
  end

  private

  # Path of a fixture file in the lexicons directory next to this test.
  def fixture(name)
    File.join(File.dirname(__FILE__), 'lexicons', name)
  end

end
|
metadata
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: imatch
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Matt Sanford
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-09-01 00:00:00 -07:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: stemmer
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 21
|
30
|
+
segments:
|
31
|
+
- 1
|
32
|
+
- 0
|
33
|
+
- 1
|
34
|
+
version: 1.0.1
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: *id001
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: rubyforge
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ">="
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
hash: 7
|
46
|
+
segments:
|
47
|
+
- 2
|
48
|
+
- 0
|
49
|
+
- 4
|
50
|
+
version: 2.0.4
|
51
|
+
type: :development
|
52
|
+
version_requirements: *id002
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
name: hoe
|
55
|
+
prerelease: false
|
56
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
hash: 21
|
62
|
+
segments:
|
63
|
+
- 2
|
64
|
+
- 6
|
65
|
+
- 1
|
66
|
+
version: 2.6.1
|
67
|
+
type: :development
|
68
|
+
version_requirements: *id003
|
69
|
+
description: An implementation of the IMatch algorithm as described at http://www.ir.iit.edu/~abdur/Research/Duplicate.html
|
70
|
+
email:
|
71
|
+
- matt@twitter.com
|
72
|
+
executables: []
|
73
|
+
|
74
|
+
extensions: []
|
75
|
+
|
76
|
+
extra_rdoc_files:
|
77
|
+
- History.txt
|
78
|
+
- Manifest.txt
|
79
|
+
- README.txt
|
80
|
+
files:
|
81
|
+
- .autotest
|
82
|
+
- History.txt
|
83
|
+
- Manifest.txt
|
84
|
+
- README.txt
|
85
|
+
- Rakefile
|
86
|
+
- lib/imatch.rb
|
87
|
+
- lib/lexicon.rb
|
88
|
+
- lib/data/en.dat
|
89
|
+
- test/test_imatch.rb
|
90
|
+
- test/test_lexicon.rb
|
91
|
+
has_rdoc: true
|
92
|
+
homepage: http://twitter.com/mzsanford
|
93
|
+
licenses: []
|
94
|
+
|
95
|
+
post_install_message:
|
96
|
+
rdoc_options:
|
97
|
+
- --main
|
98
|
+
- README.txt
|
99
|
+
require_paths:
|
100
|
+
- lib
|
101
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
102
|
+
none: false
|
103
|
+
requirements:
|
104
|
+
- - ">="
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
hash: 3
|
107
|
+
segments:
|
108
|
+
- 0
|
109
|
+
version: "0"
|
110
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
111
|
+
none: false
|
112
|
+
requirements:
|
113
|
+
- - ">="
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
hash: 3
|
116
|
+
segments:
|
117
|
+
- 0
|
118
|
+
version: "0"
|
119
|
+
requirements: []
|
120
|
+
|
121
|
+
rubyforge_project: imatch
|
122
|
+
rubygems_version: 1.3.7
|
123
|
+
signing_key:
|
124
|
+
specification_version: 3
|
125
|
+
summary: An implementation of the IMatch algorithm as described at http://www.ir.iit.edu/~abdur/Research/Duplicate.html
|
126
|
+
test_files:
|
127
|
+
- test/test_imatch.rb
|
128
|
+
- test/test_lexicon.rb
|