imatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,85 @@
1
+
2
+ require 'set'
3
+ require 'digest/sha1'
4
+
5
+ # gem install stemmer (Porter stemmer implementation)
6
+ require 'stemmer'
7
+
8
+ require 'lexicon'
9
+
10
# Computes I-Match style signatures for text: a string is tokenized, each
# token is lower-cased (and optionally stemmed), tokens that are stop words or
# absent from the lexicon are dropped, and the SHA1 of the remaining unique,
# sorted tokens is the signature.
class IMatch
  VERSION = '0.1.0'
  DEFAULT_LEXICON_FILE = File.join(File.dirname(__FILE__), 'data', 'en.dat')
  DEFAULT_NUMBER_OF_LEXICONS = 0
  DEFAULT_LEXICON_FRACTION = 0.66

  # The full (frozen) lexicon this instance was built with.
  attr_reader :lexicon

  # file    - lexicon source handed straight to IMatch::Lexicon.new.
  # options - Hash of optional settings:
  #           :stop_words       - collection of tokens to always ignore.
  #           :stemming         - truthy to call #stem on tokens that respond to it.
  #           :lexicons         - number of extra random sub-lexicons to build.
  #           :lexicon_fraction - value passed to Lexicon#subset for each of them.
  def initialize(file = DEFAULT_LEXICON_FILE, options = {})
    @lexicon = IMatch::Lexicon.new(file).freeze
    @stop_words = (options[:stop_words] || []).to_set

    @should_stem = options[:stemming] ? true : false

    @number_of_lexicons = (options[:lexicons] || DEFAULT_NUMBER_OF_LEXICONS).to_i
    @lexicon_fraction = (options[:lexicon_fraction] || DEFAULT_LEXICON_FRACTION).to_f

    # An empty range (zero or negative count) yields no sub-lexicons at all.
    @subsets = (1..@number_of_lexicons).map { @lexicon.subset(@lexicon_fraction) }
  end

  # Returns a Set holding the signature from the full lexicon followed by one
  # signature per sub-lexicon; nil signatures are left out, so the Set may be
  # empty.
  def multiple_signatures(string, tokenize = /\s+/)
    # A nil entry makes #signature fall back to the full lexicon.
    lexicons = [nil] + @subsets
    lexicons.map { |lex| signature(string, tokenize, lex) }.compact.to_set
  end

  # Returns the hex SHA1 signature of +string+, or nil when the string is nil,
  # splits into no tokens, or has no token that survives filtering.
  #
  # tokenize - pattern given to String#split.
  # lexicon  - lexicon to filter against; defaults to the instance's own.
  def signature(string, tokenize = /\s+/, lexicon = nil)
    return nil unless string

    words = string.split(tokenize)
    return nil if words.empty?

    vocabulary = lexicon || @lexicon
    terms = words.map { |word| normalize_token(word) }
    kept = terms.select { |term| !@stop_words.include?(term) && vocabulary.include?(term) }
    return nil if kept.empty?

    # Duplicates collapse and order is fixed so word order/repetition is irrelevant.
    finger_print(kept.uniq.sort)
  end

  def to_s
    %Q{<IMatch stemming="#{@should_stem}" stop_word_count="#{@stop_words.size}">#{@lexicon}</IMatch>}
  end

  private

  # Lower-cases a raw token and stems it when stemming is on and available.
  def normalize_token(raw)
    term = raw.downcase
    return term unless @should_stem && term.respond_to?(:stem)

    term.stem
  end

  # Hex SHA1 over the tokens in the given order. Tokens are fed to the digest
  # back to back with no separator between them.
  def finger_print(tokens)
    tokens.inject(Digest::SHA1.new) { |sha, term| sha.update(term) }.to_s
  end
end
@@ -0,0 +1,48 @@
1
+
2
require 'set'

# Raised when a lexicon source is missing, empty, or of an unsupported type.
#
# Declared at the top level with Exception as its superclass so that it is the
# very same class as any pre-existing `class InvalidLexiconError < Exception`
# declaration; a different superclass would make such a re-declaration fail
# with "superclass mismatch".
class InvalidLexiconError < Exception; end

class IMatch
  # An immutable set of known terms, loaded from a newline-separated file or
  # taken from an existing Set.
  class Lexicon

    # file_or_set - one of:
    #               Set    - copied and frozen; the reported file name is "N/A".
    #               File   - the file at its #path is read.
    #               String - treated as a path, which must exist.
    #
    # Raises InvalidLexiconError for any other argument, for a missing path,
    # or when the resulting term set is empty.
    def initialize(file_or_set)
      case file_or_set
      when Set
        @file = "N/A"
        @data = file_or_set.clone.freeze
      when File
        @file = File.expand_path(file_or_set.path)
        @data = read_terms(@file)
      when String
        unless File.exist?(file_or_set)
          raise(InvalidLexiconError, "Invalid/missing lexicon file: #{file_or_set}")
        end
        @file = File.expand_path(file_or_set)
        @data = read_terms(@file)
      else
        raise(InvalidLexiconError, "Invalid/missing lexicon argument: #{file_or_set}")
      end

      raise(InvalidLexiconError, "Empty lexicon file: #{file_or_set}") if @data.empty?
    end

    # True when +key+ is one of the lexicon's terms.
    def include?(key)
      @data.include?(key)
    end

    # Number of distinct terms.
    def size
      @data.size
    end

    def to_s
      %Q{<IMatch::Lexicon size="#{size}" file="#{@file}" />}
    end

    # Builds a new Lexicon from a random sample of this one's terms.
    #
    # percentage - probability (0.0 to 1.0) that each term is kept, so the
    #              result holds roughly that fraction of the terms.
    #
    # Raises InvalidLexiconError when the sample ends up empty (e.g. for 0.0),
    # because an empty lexicon cannot be constructed.
    def subset(percentage)
      # rand is in [0, 1), so `rand < percentage` keeps a term with
      # probability `percentage`; 1.0 therefore keeps every term.
      kept = @data.select { rand < percentage }
      self.class.new(kept.to_set)
    end

    private

    # Reads +path+ and returns its lines (LF or CRLF separated) as a frozen Set.
    def read_terms(path)
      IO.read(path).split(/\r?\n/).to_set.freeze
    end

  end
end
@@ -0,0 +1,81 @@
1
+ require "test/unit"
2
+ require "imatch"
3
+
4
+ class InvalidLexiconError < Exception; end; # Error type raised by IMatch::Lexicon for unusable lexicon input; it must be visible at top level for that raise to resolve.
5
+
6
# Tests for IMatch signature generation against the bundled default lexicon.
class TestIMatch < Test::Unit::TestCase

  # IMatch must be defined and be a class.
  def test_defines_imatch_class
    assert IMatch
    assert_kind_of Class, IMatch
  end

  # With no arguments the default lexicon is loaded and is not empty.
  def test_initalize_with_no_args_loads_the_default_lexicon
    matcher = IMatch.new
    assert matcher

    loaded = matcher.lexicon
    assert loaded, "expected a lexicon"
    assert loaded.size > 0, "Didn't expect a blank lexicon"
  end

  # A nil string has no signature.
  def test_nil_input_creates_nil_output
    matcher = IMatch.new
    assert_nil matcher.signature(nil)
  end

  # Pins the exact signature expected for 'foo bar'.
  def test_known_imatch_score
    expected = '60518c1c11dc0452be71a7118a43ab68e3451b82'
    actual = IMatch.new.signature('foo bar')

    assert_kind_of String, actual
    assert_equal expected, actual
  end

  # Two independent instances agree on the same input.
  def test_imatch_consistent
    first = IMatch.new.signature('foo bar')
    second = IMatch.new.signature('foo bar')
    assert_equal first, second
  end

  # Word order does not affect the signature.
  def test_imatch_unordered
    forward = IMatch.new.signature('foo bar')
    reversed = IMatch.new.signature('bar foo')
    assert_equal forward, reversed
  end

  # Singular and plural match only when stemming is switched on.
  def test_imatch_simple_plurals_if_stemming_enabled
    stemmed = IMatch.new(IMatch::DEFAULT_LEXICON_FILE, :stemming => true)
    plain = IMatch.new

    assert_equal stemmed.signature('follower'), stemmed.signature('followers'), "Failed to stem when enabled"
    assert_not_equal plain.signature('follower'), plain.signature('followers')
  end

  # Stop words contribute nothing to the signature.
  def test_stop_words_skipped
    matcher = IMatch.new(IMatch::DEFAULT_LEXICON_FILE, :stop_words => ['a'])

    assert_nil matcher.signature("a")
    assert_equal matcher.signature("foo"), matcher.signature("a foo")
  end

  # Terms missing from the lexicon are ignored.
  def test_skipping_unknown_terms
    matcher = IMatch.new
    unknown = '{{example}}'

    assert !matcher.lexicon.include?(unknown)
    assert_nil matcher.signature(unknown)
    assert_equal matcher.signature("string"), matcher.signature("{{example}} string")
  end

  # A custom tokenizer pattern can be supplied.
  def test_alternate_splitting
    by_whitespace = IMatch.new.signature('F 16')
    by_non_word = IMatch.new.signature('F-16', /\W+/)
    assert_equal by_whitespace, by_non_word
  end

  # to_s reports the stemming flag, stop word count and the lexicon.
  def test_to_s
    matcher = IMatch.new
    rendered = matcher.to_s

    assert rendered.include?("stemming=\"false\"")
    assert rendered.include?("stop_word_count=\"0\"")
    assert rendered.include?(matcher.lexicon.to_s)
  end

  # multiple_signatures returns a non-empty Set that includes the default signature.
  def test_multiple_lexicon_signatures
    text = "this is a test"
    matcher = IMatch.new(IMatch::DEFAULT_LEXICON_FILE, :lexicons => 5)

    baseline = matcher.signature(text)
    all = matcher.multiple_signatures(text)

    assert_kind_of Set, all
    assert !all.empty?
    assert all.include?(baseline)
  end
end
@@ -0,0 +1,76 @@
1
+ require "test/unit"
2
+ require "imatch"
3
+
4
# Tests for IMatch::Lexicon construction, querying and random subsetting.
class TestIMatchLexicon < Test::Unit::TestCase

  # IMatch::Lexicon must be defined and be a class.
  def test_defines_lexicon_class
    assert IMatch::Lexicon
    assert_kind_of Class, IMatch::Lexicon
  end

  # nil is not an acceptable lexicon source.
  def test_nil_file_raises_error
    assert_raise(InvalidLexiconError) { IMatch::Lexicon.new(nil) }
  end

  # A path that does not exist is rejected.
  def test_missing_file_raises_error
    assert_raise(InvalidLexiconError) { IMatch::Lexicon.new(fixture_path('not_such_file')) }
  end

  # A file with no terms is rejected.
  def test_empty_file_raises_error
    assert_raise(InvalidLexiconError) { IMatch::Lexicon.new(fixture_path('empty.dat')) }
  end

  # size reports the number of terms in the file.
  def test_lexicon_size
    ten = IMatch::Lexicon.new(fixture_path('ten.dat'))
    assert_equal 10, ten.size
  end

  # Repeated terms are counted once.
  def test_lexicon_duplicates_ignored
    deduped = IMatch::Lexicon.new(fixture_path('duplicates.dat'))
    assert_equal 7, deduped.size
  end

  # Every term of the fixture is reported as included.
  def test_lexicon_include
    ten = IMatch::Lexicon.new(fixture_path('ten.dat'))
    expected_terms = %w(this file has ten terms in the lexicon for testing)

    expected_terms.each do |term|
      assert ten.include?(term), "Lexicon did not include test term: #{term}"
    end
  end

  # to_s mentions the expanded file name and the size.
  def test_to_s
    filename = File.expand_path(fixture_path('ten.dat'))
    rendered = IMatch::Lexicon.new(fixture_path('ten.dat'))

    assert_match(/#{filename}/, rendered.to_s)
    assert_match(/#{rendered.size}/, rendered.to_s)
  end

  # An open File object is accepted as a source.
  def test_new_with_file_argument
    handle = File.new(fixture_path('ten.dat'))
    from_handle = IMatch::Lexicon.new(handle)
    assert_equal 10, from_handle.size
  end

  # A Set is accepted as a source and reported with file "N/A".
  def test_new_with_set_argument
    from_set = IMatch::Lexicon.new(%w(a b c d).to_set)

    assert_equal 4, from_set.size
    assert_match(/N\/A/, from_set.to_s)
  end

  # A 0.5 subset of the default lexicon is between 40% and 60% of its size.
  def test_random_subset
    whole = IMatch::Lexicon.new(IMatch::DEFAULT_LEXICON_FILE)
    assert whole.size > 10000, "Default lexicon is too small for this test"

    half = whole.subset(0.5)
    portion = half.size.to_f / whole.size

    assert portion > 0.4, "A 50% subset should be >40% of the size or else random is not working (#{portion})"
    assert portion < 0.6, "A 50% subset should be <60% of the size or else random is not working (#{portion})"
  end

  private

  # Path of a fixture inside the 'lexicons' directory next to this test file.
  def fixture_path(name)
    File.join(File.dirname(__FILE__), 'lexicons', name)
  end

end
metadata ADDED
@@ -0,0 +1,128 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: imatch
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Matt Sanford
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-09-01 00:00:00 -07:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: stemmer
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ hash: 21
30
+ segments:
31
+ - 1
32
+ - 0
33
+ - 1
34
+ version: 1.0.1
35
+ type: :runtime
36
+ version_requirements: *id001
37
+ - !ruby/object:Gem::Dependency
38
+ name: rubyforge
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ hash: 7
46
+ segments:
47
+ - 2
48
+ - 0
49
+ - 4
50
+ version: 2.0.4
51
+ type: :development
52
+ version_requirements: *id002
53
+ - !ruby/object:Gem::Dependency
54
+ name: hoe
55
+ prerelease: false
56
+ requirement: &id003 !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ hash: 21
62
+ segments:
63
+ - 2
64
+ - 6
65
+ - 1
66
+ version: 2.6.1
67
+ type: :development
68
+ version_requirements: *id003
69
+ description: An implementation of the IMatch algorithm as described at http://www.ir.iit.edu/~abdur/Research/Duplicate.html
70
+ email:
71
+ - matt@twitter.com
72
+ executables: []
73
+
74
+ extensions: []
75
+
76
+ extra_rdoc_files:
77
+ - History.txt
78
+ - Manifest.txt
79
+ - README.txt
80
+ files:
81
+ - .autotest
82
+ - History.txt
83
+ - Manifest.txt
84
+ - README.txt
85
+ - Rakefile
86
+ - lib/imatch.rb
87
+ - lib/lexicon.rb
88
+ - lib/data/en.dat
89
+ - test/test_imatch.rb
90
+ - test/test_lexicon.rb
91
+ has_rdoc: true
92
+ homepage: http://twitter.com/mzsanford
93
+ licenses: []
94
+
95
+ post_install_message:
96
+ rdoc_options:
97
+ - --main
98
+ - README.txt
99
+ require_paths:
100
+ - lib
101
+ required_ruby_version: !ruby/object:Gem::Requirement
102
+ none: false
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ hash: 3
107
+ segments:
108
+ - 0
109
+ version: "0"
110
+ required_rubygems_version: !ruby/object:Gem::Requirement
111
+ none: false
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ hash: 3
116
+ segments:
117
+ - 0
118
+ version: "0"
119
+ requirements: []
120
+
121
+ rubyforge_project: imatch
122
+ rubygems_version: 1.3.7
123
+ signing_key:
124
+ specification_version: 3
125
+ summary: An implementation of the IMatch algorithm as described at http://www.ir.iit.edu/~abdur/Research/Duplicate.html
126
+ test_files:
127
+ - test/test_imatch.rb
128
+ - test/test_lexicon.rb