compound_splitter 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +20 -0
- data/.rspec +1 -0
- data/.travis.yml +2 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +67 -0
- data/Rakefile +8 -0
- data/compound_splitter.gemspec +20 -0
- data/lib/compound_splitter/dictionary.rb +55 -0
- data/lib/compound_splitter/splitter.rb +42 -0
- data/lib/compound_splitter/version.rb +3 -0
- data/lib/compound_splitter.rb +9 -0
- data/spec/compound_splitter/dictionary_spec.rb +76 -0
- data/spec/compound_splitter/splitter_spec.rb +23 -0
- data/spec/compound_splitter_spec.rb +10 -0
- data/spec/fixtures/dictionary1.txt +5 -0
- data/spec/spec_helper.rb +1 -0
- metadata +90 -0
data/.gitignore
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 David Tuite
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
# CompoundSplitter
|
2
|
+
|
3
|
+
Split compound words into their component parts. For example, 'rainyday' ->
|
4
|
+
'rainy day'.
|
5
|
+
|
6
|
+
CompoundSplitter.split('longwalk')
|
7
|
+
# => ['long', 'walk']
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
|
11
|
+
Add this line to your application's Gemfile:
|
12
|
+
|
13
|
+
gem 'compound_splitter'
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install compound_splitter
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
**Basic Usage**
|
26
|
+
|
27
|
+
splitter = CompoundSplitter::Splitter.new
|
28
|
+
splitter.split('rainyday')
|
29
|
+
# => ['rainy', 'day']
|
30
|
+
|
31
|
+
splitter.split('wickedweather')
|
32
|
+
# => ['wicked', 'weather']
|
33
|
+
|
34
|
+
**Shortcut**
|
35
|
+
|
36
|
+
There is a shortcut `split` method available on the top-level namespace.
|
37
|
+
|
38
|
+
CompoundSplitter.split('longwalk')
|
39
|
+
# => ['long', 'walk']
|
40
|
+
|
41
|
+
The longer version should be used wherever possible since doing so will
|
42
|
+
prevent loading and preparing of the dictionary multiple times.
|
43
|
+
|
44
|
+
**The Dictionary File**
|
45
|
+
The compound splitter assumes you have a dictionary file
|
46
|
+
in your file system at `/usr/share/dict/words`. If you would like
|
47
|
+
to use a different dictionary file then you can create a new dictionary
|
48
|
+
object and pass it into the splitter's initializer.
|
49
|
+
|
50
|
+
dict = CompoundSplitter::Dictionary.new('path/to/dictionary/file')
|
51
|
+
|
52
|
+
splitter = CompoundSplitter::Splitter.new(dict)
|
53
|
+
splitter.split('rainyday')
|
54
|
+
# => ['rainy', 'day']
|
55
|
+
|
56
|
+
## Acknowledgements
|
57
|
+
|
58
|
+
This gem is basically a translation to Ruby of a [Stack Overflow answer](http://stackoverflow.com/a/481773/574190)
|
59
|
+
by Darius Bacon. The answer was given in Python. Thus credit for the implementation should go to Darius.
|
60
|
+
|
61
|
+
## Contributing
|
62
|
+
|
63
|
+
1. Fork it
|
64
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
65
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
66
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
67
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/compound_splitter/version', __FILE__)
|
3
|
+
|
4
|
+
Gem::Specification.new do |gem|
|
5
|
+
gem.authors = ["David Tuite"]
|
6
|
+
gem.email = ["dtuite@gmail.com"]
|
7
|
+
gem.description = %q{Split concatenated words}
|
8
|
+
gem.summary = %q{Split words which have been concatenated together. eg. 'wickedweather' -> 'wicked weather'}
|
9
|
+
gem.homepage = "https://github.com/dtuite/compound_splitter"
|
10
|
+
|
11
|
+
gem.files = `git ls-files`.split($\)
|
12
|
+
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
13
|
+
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
14
|
+
gem.name = "compound_splitter"
|
15
|
+
gem.require_paths = ["lib"]
|
16
|
+
gem.version = CompoundSplitter::VERSION
|
17
|
+
|
18
|
+
gem.add_development_dependency 'rake'
|
19
|
+
gem.add_development_dependency 'rspec'
|
20
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module CompoundSplitter
  # A word list read from a newline-separated file, with helpers for looking
  # up how often a word occurs and how probable it is relative to the list.
  class Dictionary
    attr_accessor :file_location

    # file_location - path of the word list; falls back to
    #                 '/usr/share/dict/words' when nil.
    def initialize(file_location = nil)
      @file_location = file_location || '/usr/share/dict/words'
    end

    # Read a file of newline separated words into a downcased array.
    # The file is read once; later calls return the memoized array.
    def words
      @words ||= File.read(file_location).each_line.map { |line| line.chomp.downcase }
    end

    # Length of the longest word in the dictionary (0 when it is empty).
    #
    # The length is taken per word: `words.max` would pick the
    # lexicographically greatest word, which is not necessarily the longest.
    def max_word_length
      @max_word_length ||= words.map(&:length).max || 0
    end

    # Total number of words in the dictionary. It's a float because
    # word_prob divides by it.
    def total_word_count
      @total_word_count ||= words.length.to_f
    end

    # Hash mapping each distinct word to the number of times it appears in
    # the word list. Missing words read as 0.
    def ocurrances_hash
      # Counting does not depend on order, so the list is not sorted first.
      @ocurrances_hash ||= self.class.count_dupes(words)
    end

    # Number of ocurrances of lookup_word in the dictionary.
    def [](lookup_word)
      ocurrances_hash[lookup_word]
    end

    # Get the probability of a specific word ocurring in the dictionary:
    # its ocurrance count divided by the total number of words.
    # Returns 0.0 for unknown words and for an empty dictionary.
    def word_prob(word)
      total = total_word_count
      # Guard the division so an empty dictionary gives 0.0, not NaN.
      return 0.0 if total.zero?

      count = self[word] || 0
      count / total
    end

    # Turn an array of words into a hash where each word has a key
    # and each value is the number of ocurrances of the key in the array.
    # INFO: http://stackoverflow.com/a/5470797/574190
    #
    # This stays a public class method: it is invoked as
    # `self.class.count_dupes`, and a bare `private` keyword would not
    # apply to a `def self.` method anyway.
    def self.count_dupes(words_array)
      words_array.each_with_object(Hash.new(0)) { |word, counts| counts[word] += 1 }
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module CompoundSplitter
  # Splits a run-together string into the most probable sequence of words,
  # scoring candidate words with the probabilities reported by a dictionary.
  class Splitter
    attr_accessor :dictionary

    # dictionary - any object responding to `max_word_length` and
    #              `word_prob(word)`; a default Dictionary is built when nil.
    def initialize(dictionary = nil)
      @dictionary = dictionary || Dictionary.new
    end

    # Split compound into words with a Viterbi-style dynamic programme.
    #
    # compound - the String to split.
    #
    # Returns an Array of Strings ([] for an empty compound). Stretches that
    # match no dictionary word come back as single characters.
    def viterbi_split(compound)
      return [] if compound.empty?

      # Look back at least one character so every position always has a
      # candidate, even if the dictionary reports a max word length of 0.
      window = [dictionary.max_word_length, 1].max

      # probs[i] - best probability of any split of compound[0...i]
      # lasts[i] - start index of the final word in that best split
      probs = [1.0]
      lasts = [0]

      1.upto(compound.length) do |i|
        earliest = [0, i - window].max

        candidates = earliest.upto(i - 1).map do |j|
          [probs[j] * dictionary.word_prob(compound[j...i]), j]
        end

        # Pairs compare element-wise: highest probability wins and ties go
        # to the later start index (the shorter final word).
        best_prob, best_start = candidates.max
        probs << best_prob
        lasts << best_start
      end

      # Walk the back-pointers from the end of the string to the start.
      words = []
      i = compound.length
      while i > 0
        words.unshift(compound[lasts[i]...i])
        i = lasts[i]
      end
      words
    end

    # Convenience entry point matching the README usage
    # (`splitter.split('rainyday')`); delegates to viterbi_split.
    def split(compound)
      viterbi_split(compound)
    end
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe CompoundSplitter::Dictionary do
|
4
|
+
let(:dict) { File.expand_path('spec/fixtures/dictionary1.txt') }
|
5
|
+
subject { CompoundSplitter::Dictionary.new(dict) }
|
6
|
+
|
7
|
+
|
8
|
+
describe "initialiation" do
|
9
|
+
subject { CompoundSplitter::Dictionary.new }
|
10
|
+
|
11
|
+
it "should have a default dictionary file location" do
|
12
|
+
subject.file_location.should == '/usr/share/dict/words'
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
describe "words" do
|
17
|
+
it "should return an array of downcased words" do
|
18
|
+
expected = %w[rainy day help need help]
|
19
|
+
subject.words.should == expected
|
20
|
+
end
|
21
|
+
|
22
|
+
it "should memoize" do
|
23
|
+
File.should_receive(:read).with(dict) { dict }
|
24
|
+
subject.words
|
25
|
+
subject.words
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
describe "max_word_length" do
|
30
|
+
it "should return the length of the longest word in the dictionary" do
|
31
|
+
subject.max_word_length.should == 5
|
32
|
+
end
|
33
|
+
|
34
|
+
it "should memoize"
|
35
|
+
end
|
36
|
+
|
37
|
+
describe "total_word_count" do
|
38
|
+
it "should return the total number of words in the dictionary" do
|
39
|
+
subject.total_word_count.should == 5.0
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
describe "word_prob" do
|
44
|
+
it "should return the probability of a word being real" do
|
45
|
+
subject.word_prob('help').should == 0.4
|
46
|
+
end
|
47
|
+
|
48
|
+
it "should return 0 for non-existant words" do
|
49
|
+
subject.word_prob('grinnick').should == 0
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
describe "ocurrances_hash" do
|
54
|
+
it "should return a hash" do
|
55
|
+
subject.ocurrances_hash.should be_instance_of(Hash)
|
56
|
+
end
|
57
|
+
|
58
|
+
it "should have words as keys" do
|
59
|
+
subject.ocurrances_hash.keys.should include('help')
|
60
|
+
end
|
61
|
+
|
62
|
+
it "should have word ocurrance counts as values" do
|
63
|
+
subject.ocurrances_hash['help'].should == 2
|
64
|
+
end
|
65
|
+
|
66
|
+
it "should memoize"
|
67
|
+
end
|
68
|
+
|
69
|
+
describe "[]" do
|
70
|
+
it "should lookup words in the ocurrances_hash" do
|
71
|
+
word = "help"
|
72
|
+
subject.ocurrances_hash.should_receive(:[]).with(word)
|
73
|
+
subject[word]
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require "spec_helper"

# Exercises Splitter#viterbi_split against the default dictionary
# (Splitter.new builds a Dictionary with no explicit file location).
describe CompoundSplitter::Splitter do
  subject { CompoundSplitter::Splitter.new }

  describe "viterbi_split" do
    it "should return rainy day for rainyday" do
      subject.viterbi_split('rainyday').should == %w[rainy day]
    end

    it "should return w for w" do
      subject.viterbi_split('w').should == %w[w]
    end

    # The description states the split that is actually asserted below.
    it "should return penis land for penisland" do
      subject.viterbi_split('penisland').should == %w[penis land]
    end

    it "should do something with ''" do
      subject.viterbi_split('').should == []
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "compound_splitter"
|
metadata
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: compound_splitter
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- David Tuite
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-07-13 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rake
|
16
|
+
requirement: &70221998391920 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70221998391920
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: rspec
|
27
|
+
requirement: &70221998391360 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70221998391360
|
36
|
+
description: Split concatenated words
|
37
|
+
email:
|
38
|
+
- dtuite@gmail.com
|
39
|
+
executables: []
|
40
|
+
extensions: []
|
41
|
+
extra_rdoc_files: []
|
42
|
+
files:
|
43
|
+
- .gitignore
|
44
|
+
- .rspec
|
45
|
+
- .travis.yml
|
46
|
+
- Gemfile
|
47
|
+
- LICENSE
|
48
|
+
- README.md
|
49
|
+
- Rakefile
|
50
|
+
- compound_splitter.gemspec
|
51
|
+
- lib/compound_splitter.rb
|
52
|
+
- lib/compound_splitter/dictionary.rb
|
53
|
+
- lib/compound_splitter/splitter.rb
|
54
|
+
- lib/compound_splitter/version.rb
|
55
|
+
- spec/compound_splitter/dictionary_spec.rb
|
56
|
+
- spec/compound_splitter/splitter_spec.rb
|
57
|
+
- spec/compound_splitter_spec.rb
|
58
|
+
- spec/fixtures/dictionary1.txt
|
59
|
+
- spec/spec_helper.rb
|
60
|
+
homepage: https://github.com/dtuite/compound_splitter
|
61
|
+
licenses: []
|
62
|
+
post_install_message:
|
63
|
+
rdoc_options: []
|
64
|
+
require_paths:
|
65
|
+
- lib
|
66
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ! '>='
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
requirements: []
|
79
|
+
rubyforge_project:
|
80
|
+
rubygems_version: 1.8.16
|
81
|
+
signing_key:
|
82
|
+
specification_version: 3
|
83
|
+
summary: Split words which have been concatenated together. eg. 'wickedweather' ->
|
84
|
+
'wicked weather'
|
85
|
+
test_files:
|
86
|
+
- spec/compound_splitter/dictionary_spec.rb
|
87
|
+
- spec/compound_splitter/splitter_spec.rb
|
88
|
+
- spec/compound_splitter_spec.rb
|
89
|
+
- spec/fixtures/dictionary1.txt
|
90
|
+
- spec/spec_helper.rb
|