my_segments 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in segments.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Jason Soo
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,46 @@
1
+ # Segments
2
+
3
+ TODO: Write a gem description
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'my_segments'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install my_segments
18
+
19
+ ## Usage
20
+
21
+ Example:
22
+
23
+ ```
24
+ require 'my_segments'
25
+
26
+ # Setup the lexicon
27
+ db_path = '/tmp/foo.sqlite3'
28
+ table_name = 'lexicon'
29
+ lex = SegmentsLexicon.new(db_path, table_name)
30
+
31
+ # Setup segments
32
+ s = Segments.new(lex)
33
+
34
+ # Search
35
+ candidates = s.suggest('telepone')
36
+ puts candidates.to_s
37
+ ```
38
+
39
+ The sqlite3 table should have at least "word" and "id" columns, where "word" is a unique word in the lexicon and "id" is the primary key.
40
+ ## Contributing
41
+
42
+ 1. Fork it
43
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
44
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
45
+ 4. Push to the branch (`git push origin my-new-feature`)
46
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
Binary file
@@ -0,0 +1,77 @@
1
# A mutable collection of spelling-correction candidates.
#
# Every element is expected to respond to +id+, +misspelled+, +solution+
# and +votes+ (and +votes=+ for #vote_for).
class Candidates
  attr_accessor :candidates

  def initialize
    @candidates = [] # An array of candidate objects
  end

  # Deletes, in place, every candidate with fewer than one vote.
  # Returns the remaining array of candidates.
  def prune
    @candidates.delete_if { |c| c.votes < 1 }
  end

  # Appends a candidate and returns self so calls can be chained.
  def add(candidate)
    @candidates << candidate
    self
  end

  # Removes the candidate at the specified index and returns it
  # (nil when the index is out of range).
  def remove(index)
    @candidates.delete_at(index)
  end

  # Returns whether or not the collection contains a candidate with this
  # solution id.
  def has_id?(id)
    @candidates.any? { |c| c.id == id }
  end

  # Increments the votes of the candidate with the given id by value.
  # Returns the affected candidate object, or nil when no candidate has
  # that id (previously this raised a TypeError by indexing with nil).
  def vote_for(id, value)
    candidate = @candidates.find { |c| c.id == id }
    return nil unless candidate

    candidate.votes += value
    candidate
  end

  # Returns a new array of candidates ordered by votes, highest first.
  # The stored collection itself is left in its current order.
  def sort_by_rank
    @candidates.sort { |x, y| y.votes <=> x.votes }
  end

  # Returns the total number of votes across all candidates.
  def total_votes
    @candidates.inject(0) { |sum, c| sum + c.votes }
  end

  # Returns the number of candidates.
  def size
    @candidates.size
  end

  # Pretty prints the candidates, one "[id, misspelled, solution, votes]"
  # line each. Returns an empty string when the collection is empty or
  # has been set to nil through the accessor.
  def to_s
    Array(@candidates).map do |c|
      "[#{c.id}, #{c.misspelled}, #{c.solution}, #{c.votes}]\n"
    end.join
  end
end
65
+
66
# A single suggested correction for a misspelled query term.
#
# Holds the original (misspelled) term, the proposed solution word, the
# solution's id and a running vote count that starts at 0.0 by default.
#
# NOTE(review): this class inherits from Candidates but never calls super,
# so the inherited collection state is not initialised here -- confirm
# whether the inheritance is actually needed.
class Candidate < Candidates
  attr_accessor :misspelled, :solution, :id, :votes

  def initialize(misspelled, solution, id, votes = 0.0)
    @misspelled, @solution, @id, @votes = misspelled, solution, id, votes
    self
  end
end
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,125 @@
1
+
2
# Generates SQL LIKE-style patterns ("substring rules", a.k.a. segments)
# from a query term. '%' is the "match anything" wildcard.
class SubstringRules
  def initialize(lexicon)
    @candidates = Candidates.new
    @lex = lexicon
  end

  # Generates substring rules (segments) for the query term
  # and returns them as a flat array of pattern strings.
  # A nil or empty term yields no rules.
  def self.for(query_term)
    return [] if query_term.nil? || query_term.empty?

    method_1(query_term) +
      method_3(query_term) +
      method_4(query_term) +
      method_5(query_term) +
      method_6(query_term) +
      method_7(query_term)
  end

  # Looks for an exact match of the query and returns the candidates when
  # at least one was found (nil otherwise).
  #
  # FIXME: find_candidates is not defined anywhere in this class as shown,
  # so calling this method raises NoMethodError unless it is provided
  # elsewhere -- confirm before relying on it.
  def find(query)
    @misspelled = query

    # First look for an exact match, and return if one is found
    find_candidates([query], true)
    return @candidates if @candidates.size > 0
  end

  # Cuts one letter at a time off both the start and the end of the term
  # and wraps each shortened term in %'s. Stops once the term is three
  # characters or shorter.
  # Example:
  #   Slovakia
  #   %lovaki%
  #   %ovak%
  def self.method_1(query)
    q = query
    rules = []
    while q.size > 3
      q = q[1..-2]
      rules << "%#{q}%"
    end
    rules
  end

  # Repeatedly replaces a character near the middle of the term with a %,
  # shrinking the term by one character per round, until the pattern is
  # three characters long.
  # Example:
  #   telephone
  #   tele%hone
  #   tele%one
  #   etc
  def self.method_3(query)
    q = String.new(query)
    rules = []
    length = q.length
    while length > 3
      q.delete!('%')
      # The index is based on the length measured before the wildcard
      # was stripped, which is what walks the cut point leftwards.
      q[length / 2] = '%'
      length = q.length
      rules << String.new(q)
    end
    rules
  end

  # Divides the query in half and cuts off the front half.
  # It only adds a % to the BEGINNING of the word.
  # Example:
  #   Slovakia
  #   %akia
  def self.method_4(query)
    return [query] if query.length == 1

    ['%' + query[(query.length / 2)..-1]]
  end

  # Counterpart of method_4: keeps the front half of the query and puts
  # the % at the END of the pattern, not at the beginning.
  # Example:
  #   Slovakia
  #   Slov%
  def self.method_5(query)
    return [query] if query.length == 1

    [query[0..(query.length / 2) - 1] + '%']
  end

  # Cuts everything out of the middle of the query, leaving only the first
  # and last letters, and replaces the middle with a %.
  # An empty query yields no rule (it used to raise NoMethodError).
  # Example:
  #   Slovakia
  #   S%a
  def self.method_6(query)
    return [] if query.empty?

    [query[0] + '%' + query[-1]]
  end

  # Same as method_6, but keeps the first two AND last two characters of
  # the query. An empty query yields no rule (it used to raise TypeError).
  # Example:
  #   Slovakia
  #   Sl%ia
  def self.method_7(query)
    return [] if query.empty?
    return [query] if query.length == 1

    [query[0..1] + '%' + query[-2..-1]]
  end
end
@@ -0,0 +1,3 @@
1
# Manual smoke test: prints the suggestions for a sample misspelling.
#
# Usage: ruby tester.rb [db_path] [table_name]
#
# Segments#initialize requires a lexicon, so one is built here the same
# way the README does. The defaults below are the README's example values.
$LOAD_PATH.unshift(File.expand_path('../..', __FILE__))
require 'my_segments'

db_path    = ARGV[0] || '/tmp/foo.sqlite3'
table_name = ARGV[1] || 'lexicon'

lexicon  = SegmentsLexicon.new(db_path, table_name)
segments = Segments.new(lexicon)
puts segments.suggest('hummer')
@@ -0,0 +1,3 @@
1
class Segments
  # Release version of the gem; frozen so it cannot be mutated at runtime.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,54 @@
1
+ require 'segments_lexicon'
2
+ require "my_segments/version"
3
+ require "my_segments/candidates"
4
+ require 'my_segments/substring_rules'
5
+
6
# Suggests corrections for a query term by searching a lexicon with the
# substring rules (segments) generated from that term.
class Segments
  # Initializes with a segments lexicon. The lexicon must respond to
  # +search(pattern)+ and yield results that can be indexed with "word"
  # and "id".
  def initialize(segments_lexicon)
    @lex = segments_lexicon
  end

  # Returns the query_term broken down into
  # the substring rules (segments) we'll use
  # for searching
  def for_term(query_term)
    SubstringRules.for(query_term)
  end

  # Takes a query_term, generates the segments
  # for that term, and searches for those segment
  # matches in the lexicon.
  #
  # Returns a Candidates collection ordered by votes, highest first.
  # Candidates with equal votes keep the order in which they were found.
  def suggest(query_term)
    @qt = query_term
    @candidates = Candidates.new

    for_term(@qt).each do |seg|
      @lex.search(seg).each { |result| found(result) }
    end

    # TODO: check confidence, run ngrams, return most confident candidate set
    rank_candidates
    @candidates
  end

  private #------

  # Records one lexicon hit: a word seen before gets one more vote,
  # a new word is added as a fresh candidate.
  def found(result)
    word = result["word"]
    id = result["id"]
    if @candidates.has_id?(id)
      @candidates.vote_for(id, 1.0)
    else
      @candidates.add(Candidate.new(@qt, word, id))
    end
  end

  # Reorders the collected candidates by descending votes. The original
  # position is used as a tie-breaker because Array#sort is not stable.
  def rank_candidates
    ranked = @candidates.candidates.each_with_index
                        .sort_by { |candidate, position| [-candidate.votes, position] }
                        .map(&:first)
    @candidates.candidates = ranked
  end
end
@@ -0,0 +1,21 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for my_segments.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
# The version file ships as lib/my_segments/version.rb; requiring
# 'segments/version' only works if an unrelated gem happens to provide it.
require 'my_segments/version'

Gem::Specification.new do |gem|
  gem.name          = "my_segments"
  gem.version       = Segments::VERSION
  gem.authors       = ["Jason Soo"]
  gem.email         = ["wwwjscom@gmail.com"]
  gem.description   = %q{Segments gem}
  gem.summary       = %q{Segments gem}
  gem.homepage      = ""

  gem.add_runtime_dependency "segments_lexicon"

  # Package every file tracked by git; executables and test files are
  # picked out of that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
end
@@ -0,0 +1,11 @@
1
# The library lives under lib/my_segments/, not lib/segments/.
require_relative '../lib/my_segments/substring_rules'
require 'test/unit'

# Checks the segments generated for a sample query term.
class TestSubstringRules < Test::Unit::TestCase

  # 'telephone' yields 3 rules from method_1, 7 from method_3 and one each
  # from method_4..method_7, 14 in total.
  def test_substrings
    segs = SubstringRules.for('telephone')
    assert_instance_of(Array, segs)
    assert_equal(14, segs.size)
  end
end
metadata ADDED
@@ -0,0 +1,76 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: my_segments
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jason Soo
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-10-14 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: segments_lexicon
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ description: Segments gem
31
+ email:
32
+ - wwwjscom@gmail.com
33
+ executables: []
34
+ extensions: []
35
+ extra_rdoc_files: []
36
+ files:
37
+ - .gitignore
38
+ - Gemfile
39
+ - LICENSE.txt
40
+ - README.md
41
+ - Rakefile
42
+ - lib/my_segments.rb
43
+ - lib/my_segments/.candidates.rb.swp
44
+ - lib/my_segments/candidates.rb
45
+ - lib/my_segments/segments.rb
46
+ - lib/my_segments/substring_rules.rb
47
+ - lib/my_segments/tester.rb
48
+ - lib/my_segments/version.rb
49
+ - my_segments.gemspec
50
+ - test/test_substring_rules.rb
51
+ homepage: ''
52
+ licenses: []
53
+ post_install_message:
54
+ rdoc_options: []
55
+ require_paths:
56
+ - lib
57
+ required_ruby_version: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ! '>='
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ required_rubygems_version: !ruby/object:Gem::Requirement
64
+ none: false
65
+ requirements:
66
+ - - ! '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ requirements: []
70
+ rubyforge_project:
71
+ rubygems_version: 1.8.24
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: Segments gem
75
+ test_files:
76
+ - test/test_substring_rules.rb