compound_splitter 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +20 -0
- data/.rspec +1 -0
- data/.travis.yml +2 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +67 -0
- data/Rakefile +8 -0
- data/compound_splitter.gemspec +20 -0
- data/lib/compound_splitter/dictionary.rb +55 -0
- data/lib/compound_splitter/splitter.rb +42 -0
- data/lib/compound_splitter/version.rb +3 -0
- data/lib/compound_splitter.rb +9 -0
- data/spec/compound_splitter/dictionary_spec.rb +76 -0
- data/spec/compound_splitter/splitter_spec.rb +23 -0
- data/spec/compound_splitter_spec.rb +10 -0
- data/spec/fixtures/dictionary1.txt +5 -0
- data/spec/spec_helper.rb +1 -0
- metadata +90 -0
data/.gitignore
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 David Tuite
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
# CompoundSplitter
|
2
|
+
|
3
|
+
Split compound words into their component parts. For example, 'rainyday' ->
|
4
|
+
'rainy day'.
|
5
|
+
|
6
|
+
CompoundSplitter.split('longwalk')
|
7
|
+
# => ['long', 'walk']
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
|
11
|
+
Add this line to your application's Gemfile:
|
12
|
+
|
13
|
+
gem 'compound_splitter'
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install compound_splitter
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
**Basic Usage**
|
26
|
+
|
27
|
+
splitter = CompoundSplitter::Splitter.new
|
28
|
+
splitter.split('rainyday')
|
29
|
+
# => ['rainy', 'day']
|
30
|
+
|
31
|
+
splitter.split('wickedweather')
|
32
|
+
# => ['wicked', 'weather']
|
33
|
+
|
34
|
+
**Shortcut**
|
35
|
+
|
36
|
+
There is a shortcut `split` method available on the top-level namespace.
|
37
|
+
|
38
|
+
CompoundSplitter.split('longwalk')
|
39
|
+
# => ['long', 'walk']
|
40
|
+
|
41
|
+
The longer version should be used wherever possible since doing so will
|
42
|
+
prevent loading and preparing of the dictionary multiple times.
|
43
|
+
|
44
|
+
**The Dictionary File**
|
45
|
+
The compound splitter assumes you have a dictionary file
|
46
|
+
in your file system at `/usr/share/dict/words`. If you would like
|
47
|
+
to use a different dictionary file then you can create a new dictionary
|
48
|
+
object and pass it into the splitter's initializer.
|
49
|
+
|
50
|
+
dict = CompoundSplitter::Dictionary.new('path/to/dictionary/file')
|
51
|
+
|
52
|
+
splitter = CompoundSplitter::Splitter.new(dict)
|
53
|
+
splitter.split('rainyday')
|
54
|
+
# => ['rainy', 'day']
|
55
|
+
|
56
|
+
## Acknowledgements
|
57
|
+
|
58
|
+
This gem is basically a translation to Ruby of a [Stack Overflow answer](http://stackoverflow.com/a/481773/574190)
|
59
|
+
by Darius Bacon. The answer was given in Python. Thus credit for the implementation should go to Darius.
|
60
|
+
|
61
|
+
## Contributing
|
62
|
+
|
63
|
+
1. Fork it
|
64
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
65
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
66
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
67
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/compound_splitter/version', __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for compound_splitter: package metadata, the list of
# shipped files and the development-only dependencies.
Gem::Specification.new do |gem|
  gem.authors       = ["David Tuite"]
  gem.email         = ["dtuite@gmail.com"]
  gem.description   = %q{Split concatenated words}
  gem.summary       = %q{Split words which have been concatenated together. eg. 'wickedweather' -> 'wicked weather'}
  gem.homepage      = "https://github.com/dtuite/compound_splitter"
  # The bundled LICENSE file is the MIT license; declare it so it shows up
  # in the published gem metadata.
  gem.license       = "MIT"

  # Split on newlines explicitly. The previous `$\` (output record separator)
  # is nil by default, which makes String#split break on any whitespace and
  # therefore mangles file names that contain spaces.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.name          = "compound_splitter"
  gem.require_paths = ["lib"]
  gem.version       = CompoundSplitter::VERSION

  gem.add_development_dependency 'rake'
  gem.add_development_dependency 'rspec'
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module CompoundSplitter
  # A word list loaded from a newline-separated dictionary file, with
  # per-word occurrence counts used to estimate word probabilities.
  class Dictionary
    attr_accessor :file_location

    # file_location - path to a newline-separated word file. Falls back to
    #                 '/usr/share/dict/words' when nil.
    def initialize(file_location = nil)
      @file_location = file_location || '/usr/share/dict/words'
    end

    # Read the dictionary file into an array of downcased words (one per
    # line, trailing newline removed). The file is read once and the result
    # is memoized.
    def words
      @words ||= File.read(file_location).each_line.map { |line| line.chomp.downcase }
    end

    # Length of the longest word in the dictionary, or 0 when the dictionary
    # is empty.
    #
    # NOTE: this must compare lengths. `words.max` would pick the
    # lexicographically greatest word (e.g. "zebra"), not the longest one.
    def max_word_length
      @max_word_length ||= words.map(&:length).max || 0
    end

    # Total number of words in the dictionary (duplicates included). It's a
    # float because word_prob divides by it.
    def total_word_count
      @total_word_count ||= words.length.to_f
    end

    # Hash mapping each distinct word to the number of times it occurs in
    # the dictionary. Unknown words look up as 0. Memoized.
    def ocurrances_hash
      @ocurrances_hash ||= self.class.count_dupes(words)
    end

    # Number of occurrences of lookup_word in the dictionary.
    def [](lookup_word)
      ocurrances_hash[lookup_word]
    end

    # Probability of a specific word occurring in the dictionary:
    # occurrences of the word divided by the total number of words.
    # Returns 0.0 for an empty dictionary instead of NaN (0 / 0.0).
    def word_prob(word)
      return 0.0 if total_word_count.zero?

      count = self[word] || 0
      count / total_word_count
    end

    # Turn an array of words into a hash where each word is a key and each
    # value is the number of occurrences of that word in the array.
    #
    # This stays a public class method: a bare `private` does not affect
    # `def self.` methods, and #ocurrances_hash calls it through
    # `self.class`, which would not be allowed for a private method.
    def self.count_dupes(words_array)
      words_array.each_with_object(Hash.new(0)) { |word, counts| counts[word] += 1 }
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module CompoundSplitter
  # Splits a run of concatenated words ("rainyday") into the most probable
  # sequence of dictionary words (["rainy", "day"]).
  class Splitter
    attr_accessor :dictionary

    # dictionary - an object responding to #max_word_length and #word_prob.
    #              Defaults to a new Dictionary when nil.
    def initialize(dictionary = nil)
      @dictionary = dictionary || Dictionary.new
    end

    # Split compound into words by dynamic programming over every prefix.
    #
    # For each prefix length i, the best split ends with the word
    # compound[j...i] that maximises
    #   (probability of the best split of the first j chars) * word_prob(word).
    # Ties on probability are broken in favour of the larger start index j,
    # because candidates are compared as [probability, j] pairs.
    #
    # compound - the String to split. nil or "" gives [].
    #
    # Returns an Array of Strings which concatenate back to compound.
    def viterbi_split(compound)
      return [] if compound.nil? || compound.empty?

      # Never look back further than the longest dictionary word. Keep the
      # window at least one character wide so there is always a candidate,
      # even when the dictionary reports a max length of 0.
      window = [dictionary.max_word_length, 1].max

      probs = [1.0] # probs[i]: probability of the best split of the first i chars
      lasts = [0]   # lasts[i]: start index of the last word in that split

      1.upto(compound.length) do |i|
        earliest = [0, i - window].max

        candidates = (earliest...i).map do |j|
          [probs[j] * dictionary.word_prob(compound[j...i]), j]
        end

        best_prob, best_start = candidates.max
        probs << best_prob
        lasts << best_start
      end

      # Walk the recorded start indices backwards from the end of the string
      # to recover the words, then restore left-to-right order.
      words = []
      i = compound.length
      while i > 0
        words << compound[lasts[i]...i]
        i = lasts[i]
      end

      words.reverse
    end

    # The README documents Splitter#split as the entry point; expose it as
    # an alias of viterbi_split.
    alias_method :split, :viterbi_split
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
# Specs for CompoundSplitter::Dictionary, run against the five-word fixture
# file spec/fixtures/dictionary1.txt.
describe CompoundSplitter::Dictionary do
  let(:dict) { File.expand_path('spec/fixtures/dictionary1.txt') }
  subject { CompoundSplitter::Dictionary.new(dict) }


  describe "initialiation" do
    subject { CompoundSplitter::Dictionary.new }

    it "should have a default dictionary file location" do
      subject.file_location.should == '/usr/share/dict/words'
    end
  end

  describe "words" do
    it "should return an array of downcased words" do
      expected = %w[rainy day help need help]
      subject.words.should == expected
    end

    it "should memoize" do
      # The file must be read exactly once across both calls.
      File.should_receive(:read).with(dict) { dict }
      subject.words
      subject.words
    end
  end

  describe "max_word_length" do
    it "should return the length of the longest word in the dictionary" do
      subject.max_word_length.should == 5
    end

    it "should memoize" do
      # The word list must be consulted exactly once across both calls.
      subject.should_receive(:words).once.and_return(%w[rainy day])
      subject.max_word_length
      subject.max_word_length
    end
  end

  describe "total_word_count" do
    it "should return the total number of words in the dictionary" do
      subject.total_word_count.should == 5.0
    end
  end

  describe "word_prob" do
    it "should return the probability of a word being real" do
      subject.word_prob('help').should == 0.4
    end

    it "should return 0 for non-existant words" do
      subject.word_prob('grinnick').should == 0
    end
  end

  describe "ocurrances_hash" do
    it "should return a hash" do
      subject.ocurrances_hash.should be_instance_of(Hash)
    end

    it "should have words as keys" do
      subject.ocurrances_hash.keys.should include('help')
    end

    it "should have word ocurrance counts as values" do
      subject.ocurrances_hash['help'].should == 2
    end

    it "should memoize" do
      # The counts must be built from the word list exactly once.
      subject.should_receive(:words).once.and_return(%w[help help])
      subject.ocurrances_hash
      subject.ocurrances_hash
    end
  end

  describe "[]" do
    it "should lookup words in the ocurrances_hash" do
      word = "help"
      subject.ocurrances_hash.should_receive(:[]).with(word)
      subject[word]
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
# Specs for CompoundSplitter::Splitter.
#
# The subject is built without an explicit dictionary, so these examples run
# against the default dictionary file (/usr/share/dict/words) of the machine
# executing the suite.
describe CompoundSplitter::Splitter do
  subject { CompoundSplitter::Splitter.new }

  describe "viterbi_split" do
    it "should return rainy day for rainyday" do
      subject.viterbi_split('rainyday').should == %w[rainy day]
    end

    it "should return w for w" do
      subject.viterbi_split('w').should == %w[w]
    end

    # The description previously claimed "pen island", contradicting the
    # expectation below; it now states what is actually asserted.
    it "should return penis land for penisland" do
      subject.viterbi_split('penisland').should == %w[penis land]
    end

    it "should do something with ''" do
      subject.viterbi_split('').should == []
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "compound_splitter"
|
metadata
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: compound_splitter
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- David Tuite
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-07-13 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rake
|
16
|
+
requirement: &70221998391920 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70221998391920
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: rspec
|
27
|
+
requirement: &70221998391360 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70221998391360
|
36
|
+
description: Split concatenated words
|
37
|
+
email:
|
38
|
+
- dtuite@gmail.com
|
39
|
+
executables: []
|
40
|
+
extensions: []
|
41
|
+
extra_rdoc_files: []
|
42
|
+
files:
|
43
|
+
- .gitignore
|
44
|
+
- .rspec
|
45
|
+
- .travis.yml
|
46
|
+
- Gemfile
|
47
|
+
- LICENSE
|
48
|
+
- README.md
|
49
|
+
- Rakefile
|
50
|
+
- compound_splitter.gemspec
|
51
|
+
- lib/compound_splitter.rb
|
52
|
+
- lib/compound_splitter/dictionary.rb
|
53
|
+
- lib/compound_splitter/splitter.rb
|
54
|
+
- lib/compound_splitter/version.rb
|
55
|
+
- spec/compound_splitter/dictionary_spec.rb
|
56
|
+
- spec/compound_splitter/splitter_spec.rb
|
57
|
+
- spec/compound_splitter_spec.rb
|
58
|
+
- spec/fixtures/dictionary1.txt
|
59
|
+
- spec/spec_helper.rb
|
60
|
+
homepage: https://github.com/dtuite/compound_splitter
|
61
|
+
licenses: []
|
62
|
+
post_install_message:
|
63
|
+
rdoc_options: []
|
64
|
+
require_paths:
|
65
|
+
- lib
|
66
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ! '>='
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
requirements: []
|
79
|
+
rubyforge_project:
|
80
|
+
rubygems_version: 1.8.16
|
81
|
+
signing_key:
|
82
|
+
specification_version: 3
|
83
|
+
summary: Split words which have been concatenated together. eg. 'wickedweather' ->
|
84
|
+
'wicked weather'
|
85
|
+
test_files:
|
86
|
+
- spec/compound_splitter/dictionary_spec.rb
|
87
|
+
- spec/compound_splitter/splitter_spec.rb
|
88
|
+
- spec/compound_splitter_spec.rb
|
89
|
+
- spec/fixtures/dictionary1.txt
|
90
|
+
- spec/spec_helper.rb
|