my_segments 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in my_segments.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Jason Soo
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,46 @@
1
+ # Segments
2
+
3
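+ Segments suggests lexicon words for a (possibly misspelled) query term: it breaks the term into substring rules (`%`-wildcard patterns), searches the lexicon for each rule, and votes for the words that match more than one rule.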
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'my_segments'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install my_segments
18
+
19
+ ## Usage
20
+
21
+ Example:
22
+
23
+ ```
24
+ require 'my_segments'
25
+
26
+ # Setup the lexicon
27
+ db_path = '/tmp/foo.sqlite3'
28
+ table_name = 'lexicon'
29
+ lex = SegmentsLexicon.new(db_path, table_name)
30
+
31
+ # Setup segments
32
+ s = Segments.new(lex)
33
+
34
+ # Search
35
+ candidates = s.suggest('telepone')
36
+ puts candidates.to_s
37
+ ```
38
+
39
+ The sqlite3 table should have at least a "word" and an "id" column, where "word" is a unique word in the lexicon and "id" is the primary key.
40
+ ## Contributing
41
+
42
+ 1. Fork it
43
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
44
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
45
+ 4. Push to the branch (`git push origin my-new-feature`)
46
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
Binary file
@@ -0,0 +1,77 @@
1
# A mutable, ordered collection of candidate objects. A candidate is any
# object that responds to +id+, +misspelled+, +solution+ and +votes+
# (and +votes=+ for voting).
class Candidates
  attr_accessor :candidates

  def initialize
    @candidates = [] # candidate objects, in insertion order
  end

  # Deletes every candidate that has fewer than one vote.
  # Mutates the collection and returns the underlying array.
  def prune
    @candidates.delete_if { |c| c.votes < 1 }
  end

  # Appends +candidate+ to the collection.
  # Returns self so that calls can be chained.
  def add(candidate)
    @candidates << candidate
    self
  end

  # Removes the candidate at the specified index.
  # Returns the removed candidate, or nil when the index is out of range.
  def remove(index)
    @candidates.delete_at(index)
  end

  # Returns whether or not the collection contains a candidate with this
  # solution id.
  def has_id?(id)
    @candidates.any? { |c| c.id == id }
  end

  # Increments the votes of the first candidate whose id is +id+ by +value+.
  # Returns the affected candidate, or nil when no candidate has that id
  # (looking the id up by index used to raise an accidental TypeError).
  def vote_for(id, value)
    candidate = @candidates.find { |c| c.id == id }
    return nil unless candidate

    candidate.votes += value
    candidate
  end

  # Returns a new array of the candidates ordered by votes, highest first.
  # Candidates with equal votes keep their insertion order. The collection
  # itself is not reordered.
  def sort_by_rank
    @candidates.each_with_index.sort_by { |c, i| [-c.votes, i] }.map(&:first)
  end

  # Returns the total number of votes over all candidates.
  def total_votes
    @candidates.inject(0) { |sum, c| sum + c.votes }
  end

  # Returns the number of candidates.
  def size
    @candidates.size
  end

  # Pretty prints the candidates, one "[id, misspelled, solution, votes]"
  # line per candidate. Returns an empty string when there are none.
  def to_s
    # @candidates is nil when a subclass defines its own initialize without
    # calling super, so keep this method nil-safe.
    return '' if @candidates.nil?

    @candidates.map { |c| "[#{c.id}, #{c.misspelled}, #{c.solution}, #{c.votes}]\n" }.join
  end
end
65
+
66
# One suggestion for a query term: the term as typed (+misspelled+), the
# proposed word (+solution+), the +id+ of that word, and the +votes+ it has
# collected so far.
class Candidate < Candidates
  attr_accessor :id, :misspelled, :solution, :votes

  # Builds a candidate; +votes+ starts at 0.0 unless given.
  def initialize(misspelled, solution, id, votes = 0.0)
    @misspelled, @solution = misspelled, solution
    @id, @votes = id, votes
  end
end
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,125 @@
1
+
2
# Builds the substring rules ("segments") for a query term. Each rule is a
# string in which '%' stands for "match anything", meant to be handed to
# the lexicon search.
class SubstringRules
  def initialize(lexicon)
    @candidates = Candidates.new
    @lex = lexicon
  end

  # Generates every substring rule for +query_term+ and returns them as one
  # flat array, in the order method_1, method_3, method_4, method_5,
  # method_6, method_7. A nil or empty term yields no rules (it used to
  # raise inside method_6).
  def self.for(query_term)
    return [] if query_term.nil? || query_term.empty?

    [
      method_1(query_term),
      method_3(query_term),
      method_4(query_term),
      method_5(query_term),
      method_6(query_term),
      method_7(query_term)
    ].flatten
  end

  # Looks for an exact match of +query+ and returns the candidates when at
  # least one was found, nil otherwise.
  #
  # NOTE(review): find_candidates is not defined in this class as shown;
  # unless it is supplied elsewhere this call raises NoMethodError.
  def find(query)
    @misspelled = query

    # First look for an exact match, and return if one is found
    find_candidates([query], true)
    @candidates if @candidates.size > 0
  end

  # Repeatedly cuts one letter off both the start and the end of the term
  # and wraps what is left in %'s, for as long as the term being cut is
  # longer than three characters.
  # Example:
  #   telephone  =>  %elephon%, %lepho%, %eph%
  def self.method_1(query)
    remaining = query
    rules = []
    while remaining.size > 3
      remaining = remaining[1..-2]
      rules << "%#{remaining}%"
    end
    rules
  end

  # Repeatedly replaces one character near the middle of the term with a %,
  # dropping the previously replaced character each round, so the term
  # shrinks by one letter per rule. Stops once the rule is three characters
  # long.
  # Example:
  #   telephone  =>  tele%hone, tele%one, tele%ne, tel%ne, tel%e, te%e, te%
  def self.method_3(query)
    q = query.dup # work on a copy; the caller's string is never modified
    rules = []
    length = q.length
    while length > 3
      q.delete!('%')
      # The position comes from the length measured *before* the previous
      # wildcard was stripped; this is what produces the sequence above.
      q[length / 2] = '%'
      length = q.length
      rules << q.dup
    end
    rules
  end

  # Keeps the back half of the term and puts a % at the BEGINNING only.
  # A one-character term is returned unchanged.
  # Example:
  #   telephone  =>  %phone
  def self.method_4(query)
    return [query] if query.length == 1

    ["%#{query[(query.length / 2)..-1]}"]
  end

  # Counterpart of method_4: keeps the front half of the term and puts a %
  # at the END only. A one-character term is returned unchanged.
  # Example:
  #   telephone  =>  tele%
  def self.method_5(query)
    return [query] if query.length == 1

    ["#{query[0..(query.length / 2) - 1]}%"]
  end

  # Cuts everything out of the middle of the term, leaving only the first
  # and last letters with a % between them. An empty term yields no rule.
  # Example:
  #   telephone  =>  t%e
  def self.method_6(query)
    return [] if query.empty?

    ["#{query[0, 1]}%#{query[-1, 1]}"]
  end

  # Same as method_6, but keeps the first two AND the last two characters.
  # A one-character term is returned unchanged; an empty term yields no
  # rule.
  # Example:
  #   telephone  =>  te%ne
  def self.method_7(query)
    return [] if query.empty?
    return [query] if query.length == 1

    ["#{query[0..1]}%#{query[-2..-1]}"]
  end
end
@@ -0,0 +1,3 @@
1
# Manual smoke test: prints the suggestions for the term 'hummer'.
#
# Usage: ruby tester.rb [db_path] [table_name]
#
# NOTE(review): assumes that requiring 'segments' makes both Segments and
# SegmentsLexicon available - confirm against the gem's load order.
require_relative 'segments'

# Segments#initialize requires a lexicon, so build one the way the README
# example does; the README's example values are used as defaults.
db_path    = ARGV[0] || '/tmp/foo.sqlite3'
table_name = ARGV[1] || 'lexicon'
lexicon    = SegmentsLexicon.new(db_path, table_name)

segments = Segments.new(lexicon)
puts segments.suggest('hummer')
@@ -0,0 +1,3 @@
1
# Reopens Segments only to attach the gem's version string, which the
# gemspec reads as Segments::VERSION.
class Segments
  # Version of this release.
  VERSION = '0.0.1'
end
@@ -0,0 +1,54 @@
1
+ require 'segments_lexicon'
2
+ require "my_segments/version"
3
+ require "my_segments/candidates"
4
+ require 'my_segments/substring_rules'
5
+
6
# Suggests lexicon words for a query term by searching the lexicon with the
# term's substring rules and counting how often each word is found.
class Segments
  # Initializes with a segments lexicon: an object whose +search(rule)+
  # returns a collection of rows readable as row["word"] and row["id"].
  def initialize(segments_lexicon)
    @lex = segments_lexicon
  end

  # Returns the query_term broken down into the substring rules (segments)
  # we'll use for searching.
  def for_term(query_term)
    SubstringRules.for(query_term)
  end

  # Takes a query_term, generates the segments for that term, and searches
  # for those segment matches in the lexicon.
  #
  # Returns a Candidates collection ordered by rank: most votes first,
  # candidates with equal votes in the order they were first found. A word
  # gets no vote the first time it is found and one vote for every further
  # rule that finds it again.
  def suggest(query_term)
    @qt = query_term
    @candidates = Candidates.new

    for_term(@qt).each do |seg|
      @lex.search(seg).each { |result| found(result) }
    end

    # TODO (from the original outline): check confidence and run n-grams
    # before returning the most confident candidate set.
    rank_candidates
    @candidates
  end

  private

  # Records one lexicon hit: votes for the word when it is already a
  # candidate, otherwise adds it as a new candidate for the current term.
  def found(result)
    word = result["word"]
    id = result["id"]
    if @candidates.has_id?(id)
      @candidates.vote_for(id, 1.0)
    else
      @candidates.add(Candidate.new(@qt, word, id))
    end
  end

  # Reorders the collected candidates by descending votes. The original
  # position is used as a tie-breaker so that the result is deterministic.
  def rank_candidates
    ranked = @candidates.candidates.each_with_index.sort_by { |c, i| [-c.votes, i] }
    @candidates.candidates = ranked.map(&:first)
  end
end
@@ -0,0 +1,21 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for my_segments.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
# The version file ships as lib/my_segments/version.rb (it defines
# Segments::VERSION), so it must be required under that path.
require 'my_segments/version'

Gem::Specification.new do |gem|
  gem.name          = "my_segments"
  gem.version       = Segments::VERSION
  gem.authors       = ["Jason Soo"]
  gem.email         = ["wwwjscom@gmail.com"]
  gem.description   = %q{Segments gem}
  gem.summary       = %q{Segments gem}
  gem.homepage      = ""

  gem.add_runtime_dependency "segments_lexicon"

  # Package everything tracked by git except editor swap files, which have
  # no place in a released gem.
  gem.files         = `git ls-files`.split($/).reject { |f| f.end_with?('.swp') }
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
end
@@ -0,0 +1,11 @@
1
# The class under test lives in lib/my_segments/substring_rules.rb.
require_relative '../lib/my_segments/substring_rules'
require 'test/unit'

# Unit tests for the substring rules generated by SubstringRules.for.
class TestSubstringRules < Test::Unit::TestCase
  # 'telephone' yields 3 + 7 + 1 + 1 + 1 + 1 = 14 rules.
  def test_substrings
    segs = SubstringRules.for('telephone')
    assert_instance_of(Array, segs)
    assert_equal(14, segs.size)
  end

  # Pins the exact rules and their order, so that a change to any single
  # rule generator is caught and not only a change in the total count.
  def test_substring_values
    expected = [
      '%elephon%', '%lepho%', '%eph%',
      'tele%hone', 'tele%one', 'tele%ne', 'tel%ne', 'tel%e', 'te%e', 'te%',
      '%phone',
      'tele%',
      't%e',
      'te%ne'
    ]
    assert_equal(expected, SubstringRules.for('telephone'))
  end
end
metadata ADDED
@@ -0,0 +1,76 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: my_segments
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jason Soo
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-10-14 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: segments_lexicon
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ description: Segments gem
31
+ email:
32
+ - wwwjscom@gmail.com
33
+ executables: []
34
+ extensions: []
35
+ extra_rdoc_files: []
36
+ files:
37
+ - .gitignore
38
+ - Gemfile
39
+ - LICENSE.txt
40
+ - README.md
41
+ - Rakefile
42
+ - lib/my_segments.rb
43
+ - lib/my_segments/.candidates.rb.swp
44
+ - lib/my_segments/candidates.rb
45
+ - lib/my_segments/segments.rb
46
+ - lib/my_segments/substring_rules.rb
47
+ - lib/my_segments/tester.rb
48
+ - lib/my_segments/version.rb
49
+ - my_segments.gemspec
50
+ - test/test_substring_rules.rb
51
+ homepage: ''
52
+ licenses: []
53
+ post_install_message:
54
+ rdoc_options: []
55
+ require_paths:
56
+ - lib
57
+ required_ruby_version: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ! '>='
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ required_rubygems_version: !ruby/object:Gem::Requirement
64
+ none: false
65
+ requirements:
66
+ - - ! '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ requirements: []
70
+ rubyforge_project:
71
+ rubygems_version: 1.8.24
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: Segments gem
75
+ test_files:
76
+ - test/test_substring_rules.rb