my_segments 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +46 -0
- data/Rakefile +1 -0
- data/lib/my_segments/.candidates.rb.swp +0 -0
- data/lib/my_segments/candidates.rb +77 -0
- data/lib/my_segments/segments.rb +1 -0
- data/lib/my_segments/substring_rules.rb +125 -0
- data/lib/my_segments/tester.rb +3 -0
- data/lib/my_segments/version.rb +3 -0
- data/lib/my_segments.rb +54 -0
- data/my_segments.gemspec +21 -0
- data/test/test_substring_rules.rb +11 -0
- metadata +76 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Jason Soo
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
# Segments
|
2
|
+
|
3
|
+
TODO: Write a gem description
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'my_segments'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install my_segments
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
Example:
|
22
|
+
|
23
|
+
```
|
24
|
+
require 'my_segments'
|
25
|
+
|
26
|
+
# Setup the lexicon
|
27
|
+
db_path = '/tmp/foo.sqlite3'
|
28
|
+
table_name = 'lexicon'
|
29
|
+
lex = SegmentsLexicon.new(db_path, table_name)
|
30
|
+
|
31
|
+
# Setup segments
|
32
|
+
s = Segments.new(lex)
|
33
|
+
|
34
|
+
# Search
|
35
|
+
candidates = s.suggest('telepone')
|
36
|
+
puts candidates.to_s
|
37
|
+
```
|
38
|
+
|
39
|
+
The sqlite3 table should have at least a "word" column and an "id" column, where word is a unique word in the lexicon and id is the primary key.
|
40
|
+
## Contributing
|
41
|
+
|
42
|
+
1. Fork it
|
43
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
44
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
45
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
46
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
Binary file
|
@@ -0,0 +1,77 @@
|
|
1
|
+
class Candidates
|
2
|
+
|
3
|
+
attr_accessor :candidates
|
4
|
+
|
5
|
+
def initialize
|
6
|
+
@candidates = Array.new # An array of candidate objects
|
7
|
+
self
|
8
|
+
end
|
9
|
+
|
10
|
+
def prune
|
11
|
+
@candidates.delete_if { |x| x.votes < 1 }
|
12
|
+
end
|
13
|
+
|
14
|
+
def add(candidate)
|
15
|
+
@candidates << candidate
|
16
|
+
self
|
17
|
+
end
|
18
|
+
|
19
|
+
# Removes the candidate at the specified index
|
20
|
+
def remove(index)
|
21
|
+
@candidates.delete_at(index)
|
22
|
+
end
|
23
|
+
|
24
|
+
# Returns whether or not the array contains a candidate with this
|
25
|
+
# solution id
|
26
|
+
def has_id?(id)
|
27
|
+
@candidates.collect(&:id).include?(id)
|
28
|
+
end
|
29
|
+
|
30
|
+
# Increments the votes of the candidate whose id equals +id+ by +value+.
# Returns the affected candidate object, or nil when no candidate
# carries that id (nothing is modified in that case).
def vote_for(id, value)
  # Look the candidate up directly; no intermediate array of ids is built.
  candidate = @candidates.find { |c| c.id == id }
  return nil unless candidate

  candidate.votes += value
  candidate
end
|
37
|
+
|
38
|
+
# Returns a new array of candidates sorted by their rank
# (votes, highest first). The internal @candidates array keeps
# its original order.
def sort_by_rank
  # Array#sort returns a fresh array, so its result has to be the
  # return value; @candidates itself is never reordered.
  @candidates.sort { |a, b| b.votes <=> a.votes }
end
|
43
|
+
|
44
|
+
# Returns the total number of votes
|
45
|
+
def total_votes
|
46
|
+
@candidates.inject(0) { |sum, c| sum + c.votes }
|
47
|
+
end
|
48
|
+
|
49
|
+
# Returns the number of candidates
|
50
|
+
def size
|
51
|
+
@candidates.size
|
52
|
+
end
|
53
|
+
|
54
|
+
# Pretty prints the array of candidates
|
55
|
+
def to_s
|
56
|
+
s = ''
|
57
|
+
unless @candidates == nil
|
58
|
+
@candidates.each do |c|
|
59
|
+
s += "[#{c.id}, #{c.misspelled}, #{c.solution}, #{c.votes}]\n"
|
60
|
+
end
|
61
|
+
end
|
62
|
+
return s
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
class Candidate < Candidates
|
67
|
+
|
68
|
+
attr_accessor :id, :misspelled, :solution, :votes
|
69
|
+
|
70
|
+
def initialize(misspelled, solution, id, votes = 0.0)
|
71
|
+
@misspelled = misspelled
|
72
|
+
@solution = solution
|
73
|
+
@id = id
|
74
|
+
@votes = votes
|
75
|
+
self
|
76
|
+
end
|
77
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
|
@@ -0,0 +1,125 @@
|
|
1
|
+
|
2
|
+
class SubstringRules
|
3
|
+
|
4
|
+
def initialize(lexicon)
|
5
|
+
@candidates = Candidates.new
|
6
|
+
@lex = lexicon
|
7
|
+
end
|
8
|
+
|
9
|
+
# Generates substring rules (segments) for the query term
|
10
|
+
# and returns them.
|
11
|
+
def self.for(query_term)
|
12
|
+
segs = []
|
13
|
+
segs << self.method_1(query_term)
|
14
|
+
segs << self.method_3(query_term)
|
15
|
+
segs << self.method_4(query_term)
|
16
|
+
segs << self.method_5(query_term)
|
17
|
+
segs << self.method_6(query_term)
|
18
|
+
segs << self.method_7(query_term)
|
19
|
+
segs.flatten
|
20
|
+
end
|
21
|
+
|
22
|
+
def find(query)
|
23
|
+
@misspelled = query
|
24
|
+
|
25
|
+
# First look for an exact match, and return if one is found
|
26
|
+
find_candidates([query], true)
|
27
|
+
if @candidates.size > 0
|
28
|
+
return @candidates
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
# This function cuts off one letter at a time from the start and end of the search term...
|
33
|
+
# It then re-searches using the new term. It continues to do so until the ET is reached,
|
34
|
+
# Or the term has become too small to cut off more letters.
|
35
|
+
# Example:
|
36
|
+
# %Slovakia%
|
37
|
+
# %lovaki%
|
38
|
+
# %ovak%
|
39
|
+
# etc
|
40
|
+
def self.method_1(query)
|
41
|
+
q = String.new(query)
|
42
|
+
@substring_rules = []
|
43
|
+
while q.size > 3 do
|
44
|
+
q = q[1..-2]
|
45
|
+
@substring_rules << "%" + q + "%"
|
46
|
+
end
|
47
|
+
@substring_rules
|
48
|
+
end
|
49
|
+
|
50
|
+
# This function replaces the middle of the search term with %'s
|
51
|
+
# MySQL treats %'s as "match anything". The function then re-searches
|
52
|
+
# The database using the new query until either the ET is reached,
|
53
|
+
# Or until the query is too short to continue dividing.
|
54
|
+
# Example:
|
55
|
+
# %Slovakia%
|
56
|
+
# %Slov%kia%
|
57
|
+
# %Slo%ia%
|
58
|
+
# etc
|
59
|
+
def self.method_3(query)
|
60
|
+
q = String.new(query)
|
61
|
+
@substring_rules = []
|
62
|
+
@length = q.length
|
63
|
+
while @length > 3 do
|
64
|
+
q.gsub!('%', '')
|
65
|
+
q[@length/2] = '%'
|
66
|
+
@length = q.length
|
67
|
+
@substring_rules << String.new(q)
|
68
|
+
end
|
69
|
+
@substring_rules
|
70
|
+
end
|
71
|
+
|
72
|
+
|
73
|
+
# This function divides the query in 1/2 and cuts off the front 1/2.
|
74
|
+
# It only adds %'s to the BEGINNING of the word.
|
75
|
+
# Example:
|
76
|
+
# %Slovakia%
|
77
|
+
# %akia
|
78
|
+
def self.method_4(query)
|
79
|
+
if query.length == 1
|
80
|
+
return [query]
|
81
|
+
else
|
82
|
+
query = ["%" + query[(query.length/2)..-1]]
|
83
|
+
return query
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
|
88
|
+
# Same as above function, but keeps the front 1/2 of the query.
|
89
|
+
# However, a percent SHOULD be put at the end of the query and NOT
|
90
|
+
# at the beginning of the query.
|
91
|
+
# Example:
|
92
|
+
# %Slovakia%
|
93
|
+
# Slov%
|
94
|
+
def self.method_5(query)
|
95
|
+
if query.length == 1
|
96
|
+
return [query]
|
97
|
+
else
|
98
|
+
return [query[0..(query.length/2)-1] + "%"]
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
|
103
|
+
# This function cuts everything out of the middle of the query,
# only leaving the first and last letters. It replaces the
# chars in the middle of the query with a %.
# A one-letter query uses that letter on both sides; an empty
# query produces no rule at all.
# Example:
# Slovakia
# S%a
def self.method_6(query)
  # With no characters there is no first/last letter to keep.
  return [] if query.empty?

  ["#{query[0].chr}%#{query[-1].chr}"]
end
|
112
|
+
|
113
|
+
# Same as above, but it keeps the last two AND first two
# chars of the query. A one-letter query is returned unchanged;
# an empty query produces no rule at all.
# Example:
# Slovakia
# Sl%ia
def self.method_7(query)
  # An empty string has no trailing slice (query[-2..-1] is nil),
  # so bail out before building the rule.
  return [] if query.empty?
  return [query] if query.length == 1

  ["#{query[0..1]}%#{query[-2..-1]}"]
end
|
125
|
+
end
|
data/lib/my_segments.rb
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'segments_lexicon'
|
2
|
+
require "my_segments/version"
|
3
|
+
require "my_segments/candidates"
|
4
|
+
require 'my_segments/substring_rules'
|
5
|
+
|
6
|
+
class Segments
|
7
|
+
# Initializes with a segments lexicon
|
8
|
+
def initialize(segments_lexicon)
|
9
|
+
@lex = segments_lexicon
|
10
|
+
end
|
11
|
+
|
12
|
+
# Returns the query_term broken down into
|
13
|
+
# the substring rules (segments) we'll use
|
14
|
+
# for searching
|
15
|
+
def for_term(query_term)
|
16
|
+
SubstringRules.for(query_term)
|
17
|
+
end
|
18
|
+
|
19
|
+
# Takes a query_term, generates the segments
|
20
|
+
# for that term, and searches for those segment
|
21
|
+
# matches in the lexicon.
|
22
|
+
#
|
23
|
+
# Returns a ranked ordered list of candidates
|
24
|
+
def suggest(query_term)
|
25
|
+
@qt = query_term
|
26
|
+
@candidates = Candidates.new
|
27
|
+
srs = for_term(@qt)
|
28
|
+
|
29
|
+
srs.each do |seg|
|
30
|
+
@lex.search(seg).each do |result|
|
31
|
+
found(result)
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
# Run substring rules
|
36
|
+
# Check confidence
|
37
|
+
# Run ngrams
|
38
|
+
# Return most confident candidate set
|
39
|
+
return @candidates
|
40
|
+
end
|
41
|
+
|
42
|
+
private #------
|
43
|
+
|
44
|
+
def found(result)
|
45
|
+
word = result["word"]
|
46
|
+
id = result["id"]
|
47
|
+
if @candidates.has_id?(id)
|
48
|
+
@candidates.vote_for(id, 1.0)
|
49
|
+
else
|
50
|
+
c = Candidate.new(@qt, word, id)
|
51
|
+
@candidates.add(c)
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
data/my_segments.gemspec
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'my_segments/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |gem|
|
7
|
+
gem.name = "my_segments"
|
8
|
+
gem.version = Segments::VERSION
|
9
|
+
gem.authors = ["Jason Soo"]
|
10
|
+
gem.email = ["wwwjscom@gmail.com"]
|
11
|
+
gem.description = %q{Segments gem}
|
12
|
+
gem.summary = %q{Segments gem}
|
13
|
+
gem.homepage = ""
|
14
|
+
|
15
|
+
gem.add_runtime_dependency "segments_lexicon"
|
16
|
+
|
17
|
+
gem.files = `git ls-files`.split($/)
|
18
|
+
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
19
|
+
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
20
|
+
gem.require_paths = ["lib"]
|
21
|
+
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require_relative '../lib/my_segments/substring_rules'
|
2
|
+
require 'test/unit'
|
3
|
+
|
4
|
+
class TestSubstringRules < Test::Unit::TestCase
|
5
|
+
|
6
|
+
def test_substrings
|
7
|
+
segs = SubstringRules.for('telephone')
|
8
|
+
assert_instance_of(Array, segs)
|
9
|
+
assert_equal(14, segs.size)
|
10
|
+
end
|
11
|
+
end
|
metadata
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: my_segments
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Jason Soo
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-10-14 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: segments_lexicon
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
description: Segments gem
|
31
|
+
email:
|
32
|
+
- wwwjscom@gmail.com
|
33
|
+
executables: []
|
34
|
+
extensions: []
|
35
|
+
extra_rdoc_files: []
|
36
|
+
files:
|
37
|
+
- .gitignore
|
38
|
+
- Gemfile
|
39
|
+
- LICENSE.txt
|
40
|
+
- README.md
|
41
|
+
- Rakefile
|
42
|
+
- lib/my_segments.rb
|
43
|
+
- lib/my_segments/.candidates.rb.swp
|
44
|
+
- lib/my_segments/candidates.rb
|
45
|
+
- lib/my_segments/segments.rb
|
46
|
+
- lib/my_segments/substring_rules.rb
|
47
|
+
- lib/my_segments/tester.rb
|
48
|
+
- lib/my_segments/version.rb
|
49
|
+
- my_segments.gemspec
|
50
|
+
- test/test_substring_rules.rb
|
51
|
+
homepage: ''
|
52
|
+
licenses: []
|
53
|
+
post_install_message:
|
54
|
+
rdoc_options: []
|
55
|
+
require_paths:
|
56
|
+
- lib
|
57
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
58
|
+
none: false
|
59
|
+
requirements:
|
60
|
+
- - ! '>='
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: '0'
|
63
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
64
|
+
none: false
|
65
|
+
requirements:
|
66
|
+
- - ! '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
requirements: []
|
70
|
+
rubyforge_project:
|
71
|
+
rubygems_version: 1.8.24
|
72
|
+
signing_key:
|
73
|
+
specification_version: 3
|
74
|
+
summary: Segments gem
|
75
|
+
test_files:
|
76
|
+
- test/test_substring_rules.rb
|