my_segments 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +46 -0
- data/Rakefile +1 -0
- data/lib/my_segments/.candidates.rb.swp +0 -0
- data/lib/my_segments/candidates.rb +77 -0
- data/lib/my_segments/segments.rb +1 -0
- data/lib/my_segments/substring_rules.rb +125 -0
- data/lib/my_segments/tester.rb +3 -0
- data/lib/my_segments/version.rb +3 -0
- data/lib/my_segments.rb +54 -0
- data/my_segments.gemspec +21 -0
- data/test/test_substring_rules.rb +11 -0
- metadata +76 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Jason Soo
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
# Segments
|
2
|
+
|
3
|
+
TODO: Write a gem description
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'my_segments'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install my_segments
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
Example:
|
22
|
+
|
23
|
+
```
|
24
|
+
require 'my_segments'
|
25
|
+
|
26
|
+
# Setup the lexicon
|
27
|
+
db_path = '/tmp/foo.sqlite3'
|
28
|
+
table_name = 'lexicon'
|
29
|
+
lex = SegmentsLexicon.new(db_path, table_name)
|
30
|
+
|
31
|
+
# Setup segments
|
32
|
+
s = Segments.new(lex)
|
33
|
+
|
34
|
+
# Search
|
35
|
+
candidates = s.suggest('telepone')
|
36
|
+
puts candidates.to_s
|
37
|
+
```
|
38
|
+
|
39
|
+
The sqlite3 table should have at least a "word" column and an "id" column, where "word" is a unique word in the lexicon and "id" is the primary key.
|
40
|
+
## Contributing
|
41
|
+
|
42
|
+
1. Fork it
|
43
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
44
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
45
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
46
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
Binary file
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# A mutable collection of candidate corrections for a misspelled term.
#
# Every element is expected to respond to +id+, +votes+, +votes=+,
# +misspelled+ and +solution+.
class Candidates

  attr_accessor :candidates

  def initialize
    @candidates = [] # An array of candidate objects
  end

  # Deletes, in place, every candidate that has fewer than one vote.
  # Returns the (mutated) underlying array.
  def prune
    @candidates.delete_if { |c| c.votes < 1 }
  end

  # Appends a candidate. Returns self so calls can be chained.
  def add(candidate)
    @candidates << candidate
    self
  end

  # Removes the candidate at the specified index.
  # Returns the removed candidate, or nil when the index is out of range.
  def remove(index)
    @candidates.delete_at(index)
  end

  # Returns whether or not the array contains a candidate with this
  # solution id
  def has_id?(id)
    @candidates.any? { |c| c.id == id }
  end

  # Increments the votes of the first candidate with the given id by value.
  # Returns the affected candidate object, or nil when no candidate has
  # that id (previously this case raised a TypeError).
  def vote_for(id, value)
    candidate = @candidates.find { |c| c.id == id }
    return nil if candidate.nil?

    candidate.votes += value
    candidate
  end

  # Returns a new array of candidates sorted by votes, highest first.
  # The stored order of the collection is left untouched.
  def sort_by_rank
    @candidates.sort { |x, y| y.votes <=> x.votes }
  end

  # Returns the total number of votes
  def total_votes
    @candidates.inject(0) { |sum, c| sum + c.votes }
  end

  # Returns the number of candidates
  def size
    @candidates.size
  end

  # Pretty prints the array of candidates, one
  # "[id, misspelled, solution, votes]" line per candidate.
  def to_s
    # @candidates can be nil: the accessor allows assigning nil, and a
    # subclass whose initialize does not call super never sets it.
    return '' if @candidates.nil?

    @candidates.map { |c| "[#{c.id}, #{c.misspelled}, #{c.solution}, #{c.votes}]\n" }.join
  end
end
|
65
|
+
|
66
|
+
# One suggested correction for a misspelled term, together with its
# running vote tally.
#
# NOTE(review): this class inherits from Candidates but its initialize does
# not call super, so the inherited @candidates array is never set on a
# Candidate instance.
class Candidate < Candidates

  attr_accessor :id, :misspelled, :solution, :votes

  # misspelled - the term as typed
  # solution   - the proposed replacement word
  # id         - identifier of the solution
  # votes      - starting vote count (defaults to 0.0)
  def initialize(misspelled, solution, id, votes = 0.0)
    @misspelled, @solution = misspelled, solution
    @id, @votes = id, votes
  end
end
|
@@ -0,0 +1 @@
|
|
1
|
+
|
@@ -0,0 +1,125 @@
|
|
1
|
+
|
2
|
+
# Builds "substring rules" (segments) for a query term: strings in which
# '%' stands for a wildcard, in the style of SQL LIKE patterns.
#
# All generators are class methods and keep no state between calls.
class SubstringRules

  # Stores the lexicon and starts with an empty Candidates collection.
  def initialize(lexicon)
    @candidates = Candidates.new
    @lex = lexicon
  end

  # Generates substring rules (segments) for the query term
  # and returns them as one flat array, in generator order
  # (method_1, method_3, method_4, method_5, method_6, method_7).
  # A nil or empty query term yields an empty array.
  def self.for(query_term)
    return [] if query_term.nil? || query_term.empty?

    [
      method_1(query_term),
      method_3(query_term),
      method_4(query_term),
      method_5(query_term),
      method_6(query_term),
      method_7(query_term)
    ].flatten
  end

  # Looks for an exact match of the query and returns the candidates if
  # any were found; returns nil otherwise.
  #
  # NOTE(review): find_candidates is not defined anywhere in the visible
  # source, so calling this method presumably raises NoMethodError unless
  # it is provided elsewhere -- confirm before relying on it.
  def find(query)
    @misspelled = query

    # First look for an exact match, and return if one is found
    find_candidates([query], true)
    if @candidates.size > 0
      return @candidates
    end
  end

  # Cuts off one letter at a time from both the start and the end of the
  # term and wraps each shortened term in %'s. Stops once the remaining
  # term is three characters or shorter; terms that start out that short
  # produce no rules.
  # Example:
  #   Slovakia
  #   %lovaki%
  #   %ovak%
  #   %va%
  def self.method_1(query)
    q = String.new(query)
    rules = []
    while q.size > 3
      q = q[1..-2]
      rules << "%" + q + "%"
    end
    rules
  end

  # Replaces a character near the middle of the term with a %, then keeps
  # dropping the previous % and replacing the next middle character, so the
  # term shrinks by one character per step. Stops once the pattern is three
  # characters long; terms of three characters or fewer produce no rules.
  # Example:
  #   Slovakia
  #   Slov%kia
  #   Slov%ia
  #   Slo%ia
  #   Slo%a
  #   Sl%a
  #   Sl%
  def self.method_3(query)
    q = String.new(query)
    rules = []
    length = q.length
    while length > 3
      q.gsub!('%', '')
      # length still holds the size from before the gsub!, which is what
      # moves the wildcard position as the term shrinks.
      q[length / 2] = '%'
      length = q.length
      rules << String.new(q)
    end
    rules
  end

  # Divides the query in half and cuts off the front half.
  # It only adds a % to the BEGINNING of the word.
  # A one-character query is returned unchanged.
  # Example:
  #   Slovakia
  #   %akia
  def self.method_4(query)
    return [query] if query.length == 1

    ["%" + query[(query.length / 2)..-1]]
  end

  # Counterpart of method_4: keeps the FIRST half of the query and puts
  # a % at the end of it rather than at the beginning.
  # A one-character query is returned unchanged.
  # Example:
  #   Slovakia
  #   Slov%
  def self.method_5(query)
    return [query] if query.length == 1

    [query[0..(query.length / 2) - 1] + "%"]
  end

  # Cuts everything out of the middle of the query, only leaving the
  # first and last letters, and replaces the removed chars with a %.
  # An empty query yields no rule.
  # Example:
  #   Slovakia
  #   S%a
  def self.method_6(query)
    return [] if query.empty?

    [query[0, 1] + "%" + query[-1, 1]]
  end

  # Same as method_6, but it keeps the last two AND first two
  # chars of the query. A one-character query is returned unchanged and
  # an empty query yields no rule.
  # Example:
  #   Slovakia
  #   Sl%ia
  def self.method_7(query)
    return [] if query.empty?
    return [query] if query.length == 1

    [query[0..1] + "%" + query[-2..-1]]
  end
end
|
data/lib/my_segments.rb
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'segments_lexicon'
|
2
|
+
require "my_segments/version"
|
3
|
+
require "my_segments/candidates"
|
4
|
+
require 'my_segments/substring_rules'
|
5
|
+
|
6
|
+
# Suggestion front end: expands a query term into substring rules, looks
# each rule up in the lexicon and tallies the matches as candidates.
class Segments
  # Keeps the lexicon to search. The code below calls search(segment) on
  # it and reads "word" and "id" from every returned result.
  def initialize(segments_lexicon)
    @lex = segments_lexicon
  end

  # Breaks the query_term down into the substring rules (segments)
  # used for searching, as produced by SubstringRules.for.
  def for_term(query_term)
    SubstringRules.for(query_term)
  end

  # Generates the segments for query_term, searches the lexicon for each
  # of them and records every result.
  #
  # Returns the Candidates collection that was filled during the search;
  # no sorting or pruning is applied here.
  #
  # Steps noted in the original source for this method: run substring
  # rules, check confidence, run ngrams, return the most confident
  # candidate set.
  def suggest(query_term)
    @qt = query_term
    @candidates = Candidates.new

    for_term(@qt).each do |segment|
      @lex.search(segment).each { |match| found(match) }
    end

    @candidates
  end

  private #------

  # Records one lexicon result: a solution id that is already known gets
  # 1.0 added to its votes, an unknown one is added as a new Candidate.
  def found(result)
    solution_id = result["id"]
    return @candidates.vote_for(solution_id, 1.0) if @candidates.has_id?(solution_id)

    @candidates.add(Candidate.new(@qt, result["word"], solution_id))
  end
end
|
data/my_segments.gemspec
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for my_segments.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
# The version file ships as lib/my_segments/version.rb, so it has to be
# required under that path; 'segments/version' is not part of this package.
require 'my_segments/version'

Gem::Specification.new do |gem|
  gem.name          = "my_segments"
  gem.version       = Segments::VERSION
  gem.authors       = ["Jason Soo"]
  gem.email         = ["wwwjscom@gmail.com"]
  gem.description   = %q{Segments gem}
  gem.summary       = %q{Segments gem}
  gem.homepage      = ""
  # Matches the MIT text shipped in LICENSE.txt.
  gem.license       = "MIT"

  gem.add_runtime_dependency "segments_lexicon"

  # Package exactly what git tracks; executables and test files are
  # picked out of that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# The library lives under lib/my_segments/, not lib/segments/.
require_relative '../lib/my_segments/substring_rules'
require 'test/unit'

# Smoke test for the substring rule generators.
class TestSubstringRules < Test::Unit::TestCase

  # 'telephone' expands to 14 rules: 3 from method_1, 7 from method_3 and
  # one each from method_4, method_5, method_6 and method_7.
  def test_substrings
    segs = SubstringRules.for('telephone')
    assert_instance_of(Array, segs)
    assert_equal(14, segs.size)
  end
end
|
metadata
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: my_segments
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Jason Soo
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-10-14 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: segments_lexicon
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
description: Segments gem
|
31
|
+
email:
|
32
|
+
- wwwjscom@gmail.com
|
33
|
+
executables: []
|
34
|
+
extensions: []
|
35
|
+
extra_rdoc_files: []
|
36
|
+
files:
|
37
|
+
- .gitignore
|
38
|
+
- Gemfile
|
39
|
+
- LICENSE.txt
|
40
|
+
- README.md
|
41
|
+
- Rakefile
|
42
|
+
- lib/my_segments.rb
|
43
|
+
- lib/my_segments/.candidates.rb.swp
|
44
|
+
- lib/my_segments/candidates.rb
|
45
|
+
- lib/my_segments/segments.rb
|
46
|
+
- lib/my_segments/substring_rules.rb
|
47
|
+
- lib/my_segments/tester.rb
|
48
|
+
- lib/my_segments/version.rb
|
49
|
+
- my_segments.gemspec
|
50
|
+
- test/test_substring_rules.rb
|
51
|
+
homepage: ''
|
52
|
+
licenses: []
|
53
|
+
post_install_message:
|
54
|
+
rdoc_options: []
|
55
|
+
require_paths:
|
56
|
+
- lib
|
57
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
58
|
+
none: false
|
59
|
+
requirements:
|
60
|
+
- - ! '>='
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: '0'
|
63
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
64
|
+
none: false
|
65
|
+
requirements:
|
66
|
+
- - ! '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
requirements: []
|
70
|
+
rubyforge_project:
|
71
|
+
rubygems_version: 1.8.24
|
72
|
+
signing_key:
|
73
|
+
specification_version: 3
|
74
|
+
summary: Segments gem
|
75
|
+
test_files:
|
76
|
+
- test/test_substring_rules.rb
|