compound_splitter 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,20 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .DS_Store
19
+ *.swp
20
+ *.swo
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,2 @@
1
+ rvm:
2
+ - 1.9.3
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in compound_splitter.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 David Tuite
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,67 @@
1
+ # CompoundSplitter
2
+
3
+ Split compound words into their component parts. For example, 'rainyday' ->
4
+ 'rainy day'.
5
+
6
+ CompoundSplitter.split('longwalk')
7
+ # => ['long', 'walk']
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ gem 'compound_splitter'
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install compound_splitter
22
+
23
+ ## Usage
24
+
25
+ **Basic Usage**
26
+
27
+ splitter = CompoundSplitter::Splitter.new
28
+ splitter.split('rainyday')
29
+ # => ['rainy', 'day']
30
+
31
+ splitter.split('wickedweather')
32
+ # => ['wicked', 'weather']
33
+
34
+ **Shortcut**
35
+
36
+ There is a shortcut `split` method available on the top-level namespace.
37
+
38
+ CompoundSplitter.split('longwalk')
39
+ # => ['long', 'walk']
40
+
41
+ The longer version should be used wherever possible since doing so will
42
+ prevent loading and preparing of the dictionary multiple times.
43
+
44
+ **The Dictionary File**
45
+ The compound splitter assumes you have a dictionary file
46
+ in your file system at `/usr/share/dict/words`. If you would like
47
+ to use a different dictionary file then you can create a new dictionary
48
+ object and pass it into the splitter's initializer.
49
+
50
+ dict = CompoundSplitter::Dictionary.new('path/to/dictionary/file')
51
+
52
+ splitter = CompoundSplitter::Splitter.new(dict)
53
+ splitter.split('rainyday')
54
+ # => ['rainy', 'day']
55
+
56
+ ## Acknowledgements
57
+
58
+ This gem is basically a translation to Ruby of a [Stack Overflow answer](http://stackoverflow.com/a/481773/574190)
59
+ by Darius Bacon. The answer was given in Python. Thus credit for the implementation should go to Darius.
60
+
61
+ ## Contributing
62
+
63
+ 1. Fork it
64
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
65
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
66
+ 4. Push to the branch (`git push origin my-new-feature`)
67
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+
4
+ require 'rspec/core/rake_task'
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task default: :spec
@@ -0,0 +1,20 @@
1
# -*- encoding: utf-8 -*-
require File.expand_path('../lib/compound_splitter/version', __FILE__)

# Gem specification for compound_splitter.
Gem::Specification.new do |gem|
  gem.name          = "compound_splitter"
  gem.version       = CompoundSplitter::VERSION
  gem.authors       = ["David Tuite"]
  gem.email         = ["dtuite@gmail.com"]
  gem.description   = %q{Split concatenated words}
  gem.summary       = %q{Split words which have been concatenated together. eg. 'wickedweather' -> 'wicked weather'}
  gem.homepage      = "https://github.com/dtuite/compound_splitter"
  # The bundled LICENSE file is the MIT license; declare it in the metadata.
  gem.licenses      = ["MIT"]

  # Split on the input record separator (newline). The previous `$\` is the
  # OUTPUT record separator, which is nil by default, so the list was split on
  # any whitespace and file names containing spaces were broken apart.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency 'rake'
  gem.add_development_dependency 'rspec'
end
@@ -0,0 +1,55 @@
1
module CompoundSplitter
  # A word list read from a newline-separated file, together with the simple
  # frequency statistics derived from it.
  class Dictionary
    # Path of the word list to read.
    attr_accessor :file_location

    # file_location - path to a newline-separated word file. Falls back to
    #                 '/usr/share/dict/words' when nil.
    def initialize(file_location = nil)
      @file_location = file_location || '/usr/share/dict/words'
    end

    # Read the file of newline separated words into a downcased array.
    # The file is read once; later calls return the cached array.
    def words
      @words ||= File.read(file_location).each_line.map { |line| line.chomp.downcase }
    end

    # Length of the longest word in the dictionary (0 when it has no words).
    #
    # The longest word is found by comparing lengths. `words.max` would pick
    # the alphabetically last word instead, which is usually not the longest.
    def max_word_length
      @max_word_length ||= words.map(&:length).max || 0
    end

    # Total number of words in the dictionary. It's a float because
    # word_prob divides by it.
    def total_word_count
      @total_word_count ||= words.length.to_f
    end

    # Hash mapping each word to the number of times it occurs in the file.
    def ocurrances_hash
      @ocurrances_hash ||= self.class.count_dupes(words.sort)
    end

    # Number of occurrences of lookup_word, as stored in ocurrances_hash.
    def [](lookup_word)
      ocurrances_hash[lookup_word]
    end

    # Probability of a specific word occurring in the dictionary: its
    # occurrence count divided by the total number of words.
    # Returns 0.0 for an empty dictionary rather than NaN (0 / 0.0).
    def word_prob(word)
      return 0.0 if words.empty?

      (self[word] || 0) / total_word_count
    end

    # Turn an array of words into a hash where each word is a key and each
    # value is the number of occurrences of that word in the array. Missing
    # keys read as 0.
    # INFO: http://stackoverflow.com/a/5470797/574190
    #
    # This stays public: a bare `private` does not apply to `def self.` methods,
    # and ocurrances_hash calls it with an explicit receiver.
    def self.count_dupes(words_array)
      words_array.each_with_object(Hash.new(0)) { |word, counts| counts[word] += 1 }
    end
  end
end
@@ -0,0 +1,42 @@
1
module CompoundSplitter
  # Splits a run-together string into the most probable sequence of
  # dictionary words.
  class Splitter
    # Object answering max_word_length and word_prob(word).
    attr_accessor :dictionary

    # Convenience entry point: split compound with a throwaway splitter.
    #
    # compound   - the String to split.
    # dictionary - optional dictionary object; a default Dictionary is built
    #              when nil.
    def self.viterbi_split(compound, dictionary = nil)
      new(dictionary).viterbi_split(compound)
    end

    # dictionary - optional dictionary object; a default Dictionary is built
    #              when nil.
    def initialize(dictionary = nil)
      @dictionary = dictionary || Dictionary.new
    end

    # Split compound into its most probable parts.
    #
    # For every prefix length the best (probability, start-of-last-word) pair
    # is recorded; the words are then recovered by walking those start
    # positions back from the end of the string.
    #
    # Lookups are downcased, while the returned parts keep the caller's
    # original casing.
    #
    # Returns an Array of Strings ([] for an empty compound).
    def viterbi_split(compound)
      return [] if compound.empty?

      # Always consider at least one character, so an empty dictionary
      # (max_word_length 0) still yields a candidate for every position.
      window = [dictionary.max_word_length, 1].max

      probs = [1.0]
      lasts = [0]

      1.upto(compound.length) do |finish|
        earliest = [0, finish - window].max

        candidates = (earliest...finish).map do |start|
          part = compound[start...finish]
          [probs[start] * dictionary.word_prob(part.downcase), start]
        end

        # Arrays compare element-wise: highest probability wins and ties go
        # to the latest start position.
        best_prob, best_start = candidates.max
        probs << best_prob
        lasts << best_start
      end

      words = []
      finish = compound.length
      while finish > 0
        start = lasts[finish]
        words.unshift(compound[start...finish])
        finish = start
      end
      words
    end

    # Short name for viterbi_split.
    def split(compound)
      viterbi_split(compound)
    end
  end
end
@@ -0,0 +1,3 @@
1
module CompoundSplitter
  # Version of the gem. Frozen so the constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,9 @@
1
+ require "compound_splitter/version"
2
+ require "compound_splitter/splitter"
3
+ require "compound_splitter/dictionary"
4
+
5
module CompoundSplitter
  # Shortcut for splitting a compound with a default Splitter.
  #
  # compound - the String to split.
  #
  # Returns whatever the splitter's viterbi_split returns.
  def self.split(compound)
    # viterbi_split is an instance method of Splitter, so calling it on the
    # class raised NoMethodError. A class-level viterbi_split is used only
    # when the class really provides one; otherwise a splitter is built.
    if Splitter.respond_to?(:viterbi_split)
      Splitter.viterbi_split(compound)
    else
      Splitter.new.viterbi_split(compound)
    end
  end
end
@@ -0,0 +1,76 @@
1
+ require "spec_helper"
2
+
3
# Specs for CompoundSplitter::Dictionary, run against the five-line fixture
# word list in spec/fixtures/dictionary1.txt.
describe CompoundSplitter::Dictionary do
  let(:dict) { File.expand_path('spec/fixtures/dictionary1.txt') }
  subject { CompoundSplitter::Dictionary.new(dict) }

  describe "initialization" do
    subject { CompoundSplitter::Dictionary.new }

    it "should have a default dictionary file location" do
      subject.file_location.should == '/usr/share/dict/words'
    end
  end

  describe "words" do
    it "should return an array of downcased words" do
      expected = %w[rainy day help need help]
      subject.words.should == expected
    end

    it "should memoize" do
      # should_receive expects exactly one call, so a second read would fail.
      File.should_receive(:read).with(dict) { dict }
      subject.words
      subject.words
    end
  end

  describe "max_word_length" do
    it "should return the length of the longest word in the dictionary" do
      subject.max_word_length.should == 5
    end

    it "should memoize" do
      subject.max_word_length
      # Once computed, the word list must not be consulted again.
      subject.should_not_receive(:words)
      subject.max_word_length
    end
  end

  describe "total_word_count" do
    it "should return the total number of words in the dictionary" do
      subject.total_word_count.should == 5.0
    end
  end

  describe "word_prob" do
    it "should return the probability of a word being real" do
      subject.word_prob('help').should == 0.4
    end

    it "should return 0 for non-existent words" do
      subject.word_prob('grinnick').should == 0
    end
  end

  describe "ocurrances_hash" do
    it "should return a hash" do
      subject.ocurrances_hash.should be_instance_of(Hash)
    end

    it "should have words as keys" do
      subject.ocurrances_hash.keys.should include('help')
    end

    it "should have word occurrence counts as values" do
      subject.ocurrances_hash['help'].should == 2
    end

    it "should memoize" do
      subject.ocurrances_hash
      # Once computed, the word list must not be consulted again.
      subject.should_not_receive(:words)
      subject.ocurrances_hash
    end
  end

  describe "[]" do
    it "should lookup words in the ocurrances_hash" do
      word = "help"
      subject.ocurrances_hash.should_receive(:[]).with(word)
      subject[word]
    end
  end
end
@@ -0,0 +1,23 @@
1
+ require "spec_helper"
2
+
3
# Specs for CompoundSplitter::Splitter. The subject is built without an
# explicit dictionary, so these examples use the splitter's default one.
describe CompoundSplitter::Splitter do
  subject { CompoundSplitter::Splitter.new }

  describe "viterbi_split" do
    it "should return rainy day for rainyday" do
      subject.viterbi_split('rainyday').should == %w[rainy day]
    end

    it "should return w for w" do
      subject.viterbi_split('w').should == %w[w]
    end

    # The description now states the split that is actually asserted.
    it "should return penis land for penisland" do
      subject.viterbi_split('penisland').should == %w[penis land]
    end

    it "should return an empty array for ''" do
      subject.viterbi_split('').should == []
    end
  end
end
@@ -0,0 +1,10 @@
1
+ require "spec_helper"
2
+
3
# Spec for the top-level CompoundSplitter.split shortcut.
describe CompoundSplitter do
  it "should delegate to splitter" do
    input = 'wickedweather'
    # Expect the class-level viterbi_split to receive the same string.
    CompoundSplitter::Splitter.should_receive(:viterbi_split).with(input)
    CompoundSplitter.split(input)
  end
end
@@ -0,0 +1,5 @@
1
+ rainy
2
+ day
3
+ heLp
4
+ Need
5
+ help
@@ -0,0 +1 @@
1
+ require "compound_splitter"
metadata ADDED
@@ -0,0 +1,90 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: compound_splitter
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - David Tuite
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-07-13 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rake
16
+ requirement: &70221998391920 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *70221998391920
25
+ - !ruby/object:Gem::Dependency
26
+ name: rspec
27
+ requirement: &70221998391360 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *70221998391360
36
+ description: Split concatenated words
37
+ email:
38
+ - dtuite@gmail.com
39
+ executables: []
40
+ extensions: []
41
+ extra_rdoc_files: []
42
+ files:
43
+ - .gitignore
44
+ - .rspec
45
+ - .travis.yml
46
+ - Gemfile
47
+ - LICENSE
48
+ - README.md
49
+ - Rakefile
50
+ - compound_splitter.gemspec
51
+ - lib/compound_splitter.rb
52
+ - lib/compound_splitter/dictionary.rb
53
+ - lib/compound_splitter/splitter.rb
54
+ - lib/compound_splitter/version.rb
55
+ - spec/compound_splitter/dictionary_spec.rb
56
+ - spec/compound_splitter/splitter_spec.rb
57
+ - spec/compound_splitter_spec.rb
58
+ - spec/fixtures/dictionary1.txt
59
+ - spec/spec_helper.rb
60
+ homepage: https://github.com/dtuite/compound_splitter
61
+ licenses: []
62
+ post_install_message:
63
+ rdoc_options: []
64
+ require_paths:
65
+ - lib
66
+ required_ruby_version: !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ! '>='
70
+ - !ruby/object:Gem::Version
71
+ version: '0'
72
+ required_rubygems_version: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ requirements: []
79
+ rubyforge_project:
80
+ rubygems_version: 1.8.16
81
+ signing_key:
82
+ specification_version: 3
83
+ summary: Split words which have been concatenated together. eg. 'wickedweather' ->
84
+ 'wicked weather'
85
+ test_files:
86
+ - spec/compound_splitter/dictionary_spec.rb
87
+ - spec/compound_splitter/splitter_spec.rb
88
+ - spec/compound_splitter_spec.rb
89
+ - spec/fixtures/dictionary1.txt
90
+ - spec/spec_helper.rb