parts 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/parts.rb ADDED
@@ -0,0 +1,151 @@
+ class Array
+   def sum; inject(:+); end
+   def mean; sum.to_f / size; end
+ end
+
+ module Parts
+
+   class Tagger
+
+     attr_accessor :bigrams, :words, :tags, :bigram_smoothing, :suffixes
+
+     def initialize sentences
+       # Tag-bigrams are stored such that P(T2|T1) = @bigrams[T1][T2].
+       # Word-tag pairs are stored such that P(W|T) = @words[W][T].
+       # Tags are stored such that @tags[T] = no. of occurrences of T.
+       @bigrams = Hash.new { |h, t| h[t] = Hash.new { |h, t| h[t] = 0 } }
+       @words = Hash.new { |h, t| h[t] = Hash.new { |h, t| h[t] = 0 } }
+       @tags = Hash.new { |h, t| h[t] = 0 }
+       @bigram_smoothing = Hash.new { |h, t| h[t] = 0 }
+       @suffixes = Hash.new { |h, t| h[t] = Hash.new { |h, t| h[t] = 0 } }
+       self.load sentences
+     end
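+
+     # As a brief illustration of the structures above, with made-up counts:
+     # after training, @bigrams might hold {"DT" => {"NN" => 0.6, "JJ" => 0.4}},
+     # i.e. P(NN|DT) = 0.6, and @words might hold {"dog" => {"NN" => 0.3}},
+     # i.e. P("dog"|NN) = 0.3. The figures here are purely illustrative, not
+     # taken from any corpus.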
+
+     def load sentences
+       # Sentences are passed in as an ordered array of hash maps, where each
+       # hash map represents a word/tag pair, {:word => word, :tag => tag}.
+
+       # We prepend a start marker and append an end marker to each sentence,
+       # then iterate over each bigram in the sentence and increment the
+       # relevant counters accordingly.
+       sentences.each do |sentence|
+         sentence = [{:word => "$start", :tag => "$start"}] + sentence
+         sentence += [{:word => "$end", :tag => "$end"}]
+         sentence.each_cons(2) do |previous, current|
+           @words[current[:word]][current[:tag]] += 1
+           @bigrams[previous[:tag]][current[:tag]] += 1
+           @tags[current[:tag]] += 1
+           # Suffix length is capped at the word's length, since word[-i..-1]
+           # returns nil once i exceeds it.
+           (1..[4, current[:word].length].min).each do |i|
+             @suffixes[current[:word][-i..-1]][current[:tag]] += 1
+           end
+         end
+       end
+
+       # For each tag-bigram, we convert its counter value into a probability.
+       # We also take into account the effect add-one smoothing will have on
+       # each tag.
+       @bigrams.each do |tag, grams|
+         total = grams.values.inject(:+)
+         grams.each {|g,n| grams[g] = n.to_f/total}
+         # A float is needed here; integer division would round the smoothing
+         # term down to zero.
+         @bigram_smoothing[tag] = 1.0 / (@tags.length + total)
+       end
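+
+       # For example, if 40 distinct tags were seen and 200 bigram counts
+       # begin with the tag "DT", @bigram_smoothing["DT"] would be
+       # 1.0 / (40 + 200), roughly 0.0042. These numbers are made up purely
+       # for illustration.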
+
+       # For each word-tag pair, we convert its counter value into a
+       # probability.
+       @words.each do |word, tags|
+         # If a word occurs only once in the corpus we remove it.
+         if tags.values.sum > 1
+           tags.each {|t,n| tags[t] = n.to_f/@tags[t]}
+         else
+           @words.delete word
+         end
+       end
+
+       # For each suffix-tag pair, we convert its counter value into a
+       # probability.
+       @suffixes.each do |suffix, tags|
+         tags.each {|t,n| tags[t] = n.to_f/@tags[t]}
+       end
+
+       # We have now initialised our probability measures for tag-bigrams,
+       # word-tag pairs and suffix-tag pairs, storing them in hash map data
+       # structures for easy access.
+     end
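+
+     # As a made-up example of the conversion above: if the tag "NN" was seen
+     # 10 times in total and the word "dog" was tagged "NN" 3 times, then
+     # @words["dog"]["NN"] ends up as 0.3, an estimate of P("dog"|NN).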
+
+     def classify sentence
+       # Sentences for classification are passed in as an array of words, e.g.
+       # ["Hello", ",", "world"]. I have adapted the Viterbi algorithm to play
+       # to the strengths of Ruby. That, or it's just an implementation of
+       # Viterbi as I understand it.
+
+       # The variable, paths, will store an array of the most successful paths
+       # up to each of the possible word-tag pairs for our present word. For
+       # example, if we are currently on the word 'world' from the above
+       # example, paths will store the two highest scoring paths which result
+       # in the "NN" and "NNP" variants of the word 'world'.
+
+       # We initialise the first stage of our paths with the start tag, and
+       # set the score to 1. We also append the end marker to our sentence;
+       # unlike the start entry it is a plain word string, since the sentence
+       # itself is an array of strings.
+       paths = [{:words => [{:word => "$start", :tag => "$start"}], :score => 1}]
+       sentence += ["$end"]
+
+       # We iterate over each word in the sentence, initialising a new hash
+       # map for each word, in which we will store the most successful path up
+       # to each possible tag.
+       sentence.each do |word|
+         new_paths = Hash.new { |h, t| h[t] = {:score => -1} }
+
+         # For each path leading up to the previous word's tags, we now
+         # calculate a new score for how well they lead on to each of our
+         # current word's tags.
+         paths.each do |path|
+           prev_tag = path[:words].last[:tag]
+           tags = @words[word].keys
+           # tags = @bigrams[prev_tag].keys if tags.empty?
+           tags = @tags.keys if tags.empty?
+
+           # For each of our current word's potential tags we generate a new
+           # score. If this score is larger than any other score we have
+           # registered along other paths ending in this tag, we set it as the
+           # highest achieving path for the tag we are currently looking at.
+           # In effect this prunes our search space. When calculating
+           # word_score, in order to account for unseen words, we fall back to
+           # a suffix-based estimate via classify_unknown. For our bigram
+           # score, we introduce the smoothing term for each tag we look at.
+           # Bear in mind that, due to our initialisation of @bigrams,
+           # @bigrams[T1][T2] will always return 0 for a tag T1 or T2 which
+           # has not appeared, ensuring our smoothing still works even for
+           # tags we have not registered a bigram probability for.
+           tags.each do |tag|
+             word_score = @words[word][tag] != 0 ? @words[word][tag] : classify_unknown(word, tag)
+             bigram_score = @bigram_smoothing[prev_tag] + @bigrams[prev_tag][tag]
+             score = path[:score] * word_score * bigram_score
+             new_paths[tag] = {
+               :words => (path[:words] + [{:word => word, :tag => tag}]),
+               :score => score
+             } if score > new_paths[tag][:score]
+           end
+         end
+
+         # Here we update our best paths up until this word, for each of the
+         # word's potential tags.
+         paths = new_paths.values
+       end
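+
+       # To illustrate the scoring with made-up numbers: for the word "world"
+       # and candidate tag "NN", a previous path ending in "DT" with score 0.1,
+       # P("world"|NN) = 0.3, P(NN|DT) = 0.6 and a smoothing term of 0.004
+       # would give a new score of 0.1 * 0.3 * (0.004 + 0.6) = 0.01812.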
+
+       # Having looped over every word, we have now covered the entire
+       # sentence, and need simply pick the highest scoring path. We use
+       # [1..-2] to remove the start and end word-tag pairs from our returned
+       # path.
+       return paths.max_by {|a| a[:score]}[:words][1..-2]
+
+     end
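+
+     # For illustration, a call such as tagger.classify(["Hello", ",", "world"])
+     # returns an array of word-tag hashes along the lines of
+     # [{:word => "Hello", :tag => "UH"}, {:word => ",", :tag => ","},
+     # {:word => "world", :tag => "NN"}], with the exact tags depending on the
+     # training corpus.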
+
+     def classify_unknown word, tag
+       # Interpolation weights for suffixes of length 1 to 4 respectively.
+       suffixes_weight = [0.05,0.15,0.5,0.3]
+       suffixes_probability = (1..[4, word.length].min).map do |i|
+         @suffixes[word[-i..-1]][tag]
+       end
+       suffixes_probability.zip(suffixes_weight).map{|p, w| p * w}.sum
+     end
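+
+     # For example (with invented counts), classifying the unknown word
+     # "running" against the tag "VBG" would interpolate the probabilities of
+     # the suffixes "g", "ng", "ing" and "ning", weighting them by 0.05, 0.15,
+     # 0.5 and 0.3 respectively.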
+
+   end
+
+ end
+
+ require 'parts/tester'
metadata ADDED
@@ -0,0 +1,99 @@
+ --- !ruby/object:Gem::Specification
+ name: parts
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+   prerelease:
+ platform: ruby
+ authors:
+ - Joe Root
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2012-01-16 00:00:00.000000000Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: shoulda
+   requirement: &70189868506800 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: *70189868506800
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: &70189868506320 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.0.0
+   type: :development
+   prerelease: false
+   version_requirements: *70189868506320
+ - !ruby/object:Gem::Dependency
+   name: jeweler
+   requirement: &70189868505820 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.6.4
+   type: :development
+   prerelease: false
+   version_requirements: *70189868505820
+ - !ruby/object:Gem::Dependency
+   name: rcov
+   requirement: &70189868492780 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: *70189868492780
+ description: ''
+ email: joe@onlysix.co.uk
+ executables: []
+ extensions: []
+ extra_rdoc_files:
+ - LICENSE.txt
+ - README.rdoc
+ files:
+ - lib/parts.rb
+ - lib/parts/tester.rb
+ - lib/parts/treebank3.2.txt
+ - LICENSE.txt
+ - README.rdoc
+ homepage: http://github.com/joeroot/parts
+ licenses:
+ - MIT
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+       segments:
+       - 0
+       hash: -1093201021619199354
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.15
+ signing_key:
+ specification_version: 3
+ summary: A simple viterbi-based part of speech tagger
+ test_files: []