imatch 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,85 @@
1
+
2
+ require 'set'
3
+ require 'digest/sha1'
4
+
5
+ # gem install stemmer (Porter stemmer implementation)
6
+ require 'stemmer'
7
+
8
+ require 'lexicon'
9
+
10
# Computes IMatch signatures for strings: a SHA1 fingerprint of the unique,
# sorted tokens of a string that appear in a lexicon. Two strings with the
# same set of usable tokens get the same signature, regardless of token order.
class IMatch
  VERSION = '0.1.0'
  DEFAULT_LEXICON_FILE = File.join(File.dirname(__FILE__), 'data', 'en.dat')
  DEFAULT_NUMBER_OF_LEXICONS = 0
  DEFAULT_LEXICON_FRACTION = 0.66

  # The full (frozen) lexicon this instance was built with.
  attr_reader :lexicon

  # file    - anything IMatch::Lexicon.new accepts (defaults to the bundled lexicon).
  # options - Hash of optional settings:
  #           :stop_words       - Enumerable of terms to ignore (default: none).
  #           :stemming         - truthy to stem tokens when String#stem is available.
  #           :lexicons         - number of additional random sub-lexicons (default 0).
  #           :lexicon_fraction - fraction passed to Lexicon#subset (default 0.66).
  def initialize(file = DEFAULT_LEXICON_FILE, options = {})
    @lexicon = IMatch::Lexicon.new(file).freeze

    # Must be set before the stop words are normalized below.
    @should_stem = !!options[:stemming]

    # Tokens are downcased (and optionally stemmed) before the stop-word check,
    # so stop words get the same treatment; otherwise an entry such as "The"
    # could never match anything.
    @stop_words = (options[:stop_words] || []).map { |word| normalize(word.to_s) }.to_set

    @number_of_lexicons = (options[:lexicons] || DEFAULT_NUMBER_OF_LEXICONS).to_i
    @lexicon_fraction = (options[:lexicon_fraction] || DEFAULT_LEXICON_FRACTION).to_f

    # Integer#times is a no-op for zero or negative counts.
    @subsets = []
    @number_of_lexicons.times { @subsets << @lexicon.subset(@lexicon_fraction) }
  end

  # Returns a Set holding the signature from the full lexicon plus one
  # signature per random sub-lexicon. Lexicons that yield no signature
  # contribute nothing, so the Set may be empty.
  def multiple_signatures(string, tokenize = /\s+/)
    signatures = Set.new

    # nil selects the full lexicon inside #signature.
    [nil, *@subsets].each do |lex|
      sig = signature(string, tokenize, lex)
      signatures << sig if sig
    end

    signatures
  end

  # string   - the text to fingerprint.
  # tokenize - pattern handed to String#split (default: runs of whitespace).
  # lexicon  - lexicon to filter tokens against (default: the full lexicon).
  #
  # Returns the hex SHA1 signature String, or nil when string is nil, splits
  # into no tokens, or has no token that survives stop-word/lexicon filtering.
  def signature(string, tokenize = /\s+/, lexicon = nil)
    return nil unless string

    tokens = string.split(tokenize)
    return nil if tokens.empty?

    current_lexicon = lexicon || @lexicon

    usable_tokens = Set.new
    tokens.each do |raw|
      token = normalize(raw)

      next if @stop_words.include?(token)
      next unless current_lexicon.include?(token)

      usable_tokens << token
    end

    return nil if usable_tokens.empty?

    # Sorting makes the signature independent of token order.
    finger_print(usable_tokens.to_a.sort)
  end

  def to_s
    %Q{<IMatch stemming="#{@should_stem}" stop_word_count="#{@stop_words.size}">#{@lexicon.to_s}</IMatch>}
  end

  private

  # Downcases a token and, when stemming is enabled and the token responds to
  # #stem, stems it.
  def normalize(token)
    token = token.downcase
    token = token.stem if @should_stem && token.respond_to?(:stem)
    token
  end

  # Feeds the tokens into a single SHA1 digest and returns its hex string.
  # Tokens are appended back to back with no separator; this is kept as-is so
  # previously computed signatures remain valid.
  def finger_print(tokens)
    digest = Digest::SHA1.new
    tokens.each { |t| digest.update(t) }
    digest.to_s
  end
end
@@ -0,0 +1,48 @@
1
+
2
# Error raised by IMatch::Lexicon for a missing, unusable or empty lexicon.
# It is defined here so the library can raise it on its own; the guard leaves
# any definition that was loaded earlier untouched.
# NOTE(review): the superclass is Exception (not StandardError) so that code
# which reopens the class with `< Exception` keeps loading; migrate both
# together if this is ever changed.
unless defined?(::InvalidLexiconError)
  class InvalidLexiconError < Exception; end
end

class IMatch
  # A frozen set of known terms, built from a Set or from a file that holds
  # one term per line.
  class Lexicon

    # file_or_set - a Set of terms, an open File, or a String path to a file.
    #
    # Raises InvalidLexiconError when the argument is of another type, the
    # path does not exist, or the resulting lexicon is empty.
    def initialize(file_or_set)
      case file_or_set
      when Set
        @file = "N/A"
        @data = file_or_set.clone.freeze
      when File
        load_file(file_or_set.path)
      when String
        unless File.exist?(file_or_set)
          raise(InvalidLexiconError, "Invalid/missing lexicon file: #{file_or_set}")
        end
        load_file(file_or_set)
      else
        raise(InvalidLexiconError, "Invalid/missing lexicon argument: #{file_or_set}")
      end

      raise(InvalidLexiconError, "Empty lexicon file: #{file_or_set}") if @data.empty?
    end

    def include?(key)
      @data.include?(key)
    end

    def size
      @data.size
    end

    def to_s
      %Q{<IMatch::Lexicon size="#{size}" file="#{@file}" />}
    end

    # Returns a new Lexicon holding a random subset of this one. Each term is
    # kept independently with probability +percentage+, so the result holds
    # roughly that fraction of the terms.
    #
    # percentage should be between 0.0 and 1.0
    #
    # Raises InvalidLexiconError (from the constructor) if no term is kept.
    def subset(percentage)
      # rand is in [0, 1), so 1.0 keeps every term and 0.0 keeps none.
      kept = @data.select { rand < percentage }.to_set

      self.class.new(kept)
    end

    private

    # Records the expanded path and loads its lines (LF or CRLF terminated)
    # as the frozen term set; duplicate lines collapse into one term.
    def load_file(path)
      @file = File.expand_path(path)
      @data = File.read(@file).split(/\r?\n/).to_set.freeze
    end

  end
end
@@ -0,0 +1,81 @@
1
+ require "test/unit"
2
+ require "imatch"
3
+
4
# Error class used by the lexicon code and expected by the tests.
class InvalidLexiconError < Exception
end
5
+
6
# Exercises IMatch signature generation against the default lexicon.
class TestIMatch < Test::Unit::TestCase

  def test_defines_imatch_class
    assert IMatch
    assert IMatch.kind_of?(Class)
  end

  def test_initalize_with_no_args_loads_the_default_lexicon
    subject = build_imatch
    assert subject
    assert subject.lexicon, "expected a lexicon"
    assert subject.lexicon.size > 0, "Didn't expect a blank lexicon"
  end

  def test_nil_input_creates_nil_output
    assert_nil build_imatch.signature(nil)
  end

  # Pins the exact signature so accidental changes to the hashing are caught.
  def test_known_imatch_score
    digest = build_imatch.signature('foo bar')
    assert digest.kind_of?(String)
    assert_equal '60518c1c11dc0452be71a7118a43ab68e3451b82', digest
  end

  def test_imatch_consistent
    first = build_imatch.signature('foo bar')
    second = build_imatch.signature('foo bar')
    assert_equal first, second
  end

  def test_imatch_unordered
    forward = build_imatch.signature('foo bar')
    reversed = build_imatch.signature('bar foo')
    assert_equal forward, reversed
  end

  def test_imatch_simple_plurals_if_stemming_enabled
    stemming = build_imatch(:stemming => true)
    plain = build_imatch
    assert_equal stemming.signature('follower'), stemming.signature('followers'), "Failed to stem when enabled"
    assert_not_equal plain.signature('follower'), plain.signature('followers')
  end

  def test_stop_words_skipped
    subject = build_imatch(:stop_words => ['a'])
    assert_nil subject.signature("a")
    assert_equal subject.signature("foo"), subject.signature("a foo")
  end

  def test_skipping_unknown_terms
    subject = build_imatch
    assert !subject.lexicon.include?('{{example}}')
    assert_nil subject.signature('{{example}}')
    assert_equal subject.signature("string"), subject.signature("{{example}} string")
  end

  def test_alternate_splitting
    spaced = build_imatch.signature('F 16')
    hyphenated = build_imatch.signature('F-16', /\W+/)
    assert_equal spaced, hyphenated
  end

  def test_to_s
    subject = build_imatch
    rendered = subject.to_s
    assert rendered.include?("stemming=\"false\"")
    assert rendered.include?("stop_word_count=\"0\"")
    assert rendered.include?(subject.lexicon.to_s)
  end

  def test_multiple_lexicon_signatures
    text = "this is a test"
    subject = build_imatch(:lexicons => 5)

    baseline = subject.signature(text)
    collected = subject.multiple_signatures(text)

    assert collected.kind_of?(Set)
    assert !collected.empty?
    assert collected.include?(baseline)
  end

  private

  # Builds an IMatch on the default lexicon; options, when given, are passed
  # through as the second constructor argument.
  def build_imatch(options = nil)
    return IMatch.new unless options

    IMatch.new(IMatch::DEFAULT_LEXICON_FILE, options)
  end
end
@@ -0,0 +1,76 @@
1
+ require "test/unit"
2
+ require "imatch"
3
+
4
# Exercises IMatch::Lexicon construction, lookup and random subsetting using
# the fixture files under the test directory's `lexicons` folder.
class TestIMatchLexicon < Test::Unit::TestCase

  def test_defines_lexicon_class
    assert IMatch::Lexicon
    assert IMatch::Lexicon.kind_of?(Class)
  end

  def test_nil_file_raises_error
    assert_raise InvalidLexiconError do
      IMatch::Lexicon.new(nil)
    end
  end

  def test_missing_file_raises_error
    assert_raise InvalidLexiconError do
      IMatch::Lexicon.new(lexicon_fixture_path('not_such_file'))
    end
  end

  def test_empty_file_raises_error
    assert_raise InvalidLexiconError do
      IMatch::Lexicon.new(lexicon_fixture_path('empty.dat'))
    end
  end

  def test_lexicon_size
    lexicon = IMatch::Lexicon.new(lexicon_fixture_path('ten.dat'))
    assert_equal 10, lexicon.size
  end

  def test_lexicon_duplicates_ignored
    lexicon = IMatch::Lexicon.new(lexicon_fixture_path('duplicates.dat'))
    assert_equal 7, lexicon.size
  end

  def test_lexicon_include
    lexicon = IMatch::Lexicon.new(lexicon_fixture_path('ten.dat'))
    %w(this file has ten terms in the lexicon for testing).each do |term|
      assert lexicon.include?(term), "Lexicon did not include test term: #{term}"
    end
  end

  def test_to_s
    filename = File.expand_path(lexicon_fixture_path('ten.dat'))
    lexicon = IMatch::Lexicon.new(lexicon_fixture_path('ten.dat'))
    # The path is escaped so regex metacharacters in the checkout location
    # (e.g. "+" or "(") cannot break or loosen the match.
    assert_match(/#{Regexp.escape(filename)}/, lexicon.to_s)
    assert_match(/#{lexicon.size}/, lexicon.to_s)
  end

  def test_new_with_file_argument
    # Block form closes the handle even if an assertion fails.
    File.open(lexicon_fixture_path('ten.dat')) do |file|
      lexicon = IMatch::Lexicon.new(file)
      assert_equal 10, lexicon.size
    end
  end

  def test_new_with_set_argument
    lexicon = IMatch::Lexicon.new(%w(a b c d).to_set)
    assert_equal 4, lexicon.size
    assert_match(/N\/A/, lexicon.to_s)
  end

  def test_random_subset
    lexicon = IMatch::Lexicon.new(IMatch::DEFAULT_LEXICON_FILE)
    assert lexicon.size > 10000, "Default lexicon is too small for this test"

    subset = lexicon.subset(0.5)
    portion = (subset.size.to_f / lexicon.size.to_f).to_f

    assert portion > 0.4, "A 50% subset should be >40% of the size or else random is not working (#{portion})"
    assert portion < 0.6, "A 50% subset should be <60% of the size or else random is not working (#{portion})"
  end

  private

  # Path of a fixture file inside the `lexicons` directory next to this test.
  def lexicon_fixture_path(name)
    File.join(File.dirname(__FILE__), 'lexicons', name)
  end

end
metadata ADDED
@@ -0,0 +1,128 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: imatch
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Matt Sanford
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-09-01 00:00:00 -07:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: stemmer
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ hash: 21
30
+ segments:
31
+ - 1
32
+ - 0
33
+ - 1
34
+ version: 1.0.1
35
+ type: :runtime
36
+ version_requirements: *id001
37
+ - !ruby/object:Gem::Dependency
38
+ name: rubyforge
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ hash: 7
46
+ segments:
47
+ - 2
48
+ - 0
49
+ - 4
50
+ version: 2.0.4
51
+ type: :development
52
+ version_requirements: *id002
53
+ - !ruby/object:Gem::Dependency
54
+ name: hoe
55
+ prerelease: false
56
+ requirement: &id003 !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ hash: 21
62
+ segments:
63
+ - 2
64
+ - 6
65
+ - 1
66
+ version: 2.6.1
67
+ type: :development
68
+ version_requirements: *id003
69
+ description: An implementation of the IMatch algorithm as described at http://www.ir.iit.edu/~abdur/Research/Duplicate.html
70
+ email:
71
+ - matt@twitter.com
72
+ executables: []
73
+
74
+ extensions: []
75
+
76
+ extra_rdoc_files:
77
+ - History.txt
78
+ - Manifest.txt
79
+ - README.txt
80
+ files:
81
+ - .autotest
82
+ - History.txt
83
+ - Manifest.txt
84
+ - README.txt
85
+ - Rakefile
86
+ - lib/imatch.rb
87
+ - lib/lexicon.rb
88
+ - lib/data/en.dat
89
+ - test/test_imatch.rb
90
+ - test/test_lexicon.rb
91
+ has_rdoc: true
92
+ homepage: http://twitter.com/mzsanford
93
+ licenses: []
94
+
95
+ post_install_message:
96
+ rdoc_options:
97
+ - --main
98
+ - README.txt
99
+ require_paths:
100
+ - lib
101
+ required_ruby_version: !ruby/object:Gem::Requirement
102
+ none: false
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ hash: 3
107
+ segments:
108
+ - 0
109
+ version: "0"
110
+ required_rubygems_version: !ruby/object:Gem::Requirement
111
+ none: false
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ hash: 3
116
+ segments:
117
+ - 0
118
+ version: "0"
119
+ requirements: []
120
+
121
+ rubyforge_project: imatch
122
+ rubygems_version: 1.3.7
123
+ signing_key:
124
+ specification_version: 3
125
+ summary: An implementation of the IMatch algorithm as described at http://www.ir.iit.edu/~abdur/Research/Duplicate.html
126
+ test_files:
127
+ - test/test_imatch.rb
128
+ - test/test_lexicon.rb