parts 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/lib/parts/tester.rb +85 -0
- data/lib/parts/treebank3.2.txt +29933 -0
- data/lib/parts.rb +151 -0
- metadata +99 -0
data/lib/parts.rb
ADDED
@@ -0,0 +1,151 @@
|
|
1
|
+
# Core-class extensions used by Parts::Tagger to sum and average counter
# values. NOTE(review): monkey-patching Array is global; on Ruby >= 2.4 this
# redefinition shadows the built-in Array#sum.
class Array
  # Sum of all elements.
  # FIX: seed inject with 0 so [].sum returns 0 instead of nil — the bare
  # inject(:+) form returned nil for an empty array, which would crash
  # callers such as `tags.values.sum > 1`.
  def sum; inject(0, :+); end

  # Arithmetic mean as a Float (NaN for an empty array, since 0.0 / 0).
  def mean; sum.to_f / size; end
end
|
5
|
+
|
6
|
+
module Parts

  # A bigram hidden-Markov-model part-of-speech tagger. It is trained on a
  # corpus of tagged sentences and decodes new sentences with a Viterbi
  # search, backing off to suffix statistics for words never seen in
  # training.
  class Tagger

    attr_accessor :bigrams, :words, :tags, :bigram_smoothing, :suffixes

    # sentences - an ordered array of sentences, each an array of
    #             {:word => word, :tag => tag} hash maps.
    def initialize sentences
      # Tag-bigrams are stored such that P(T2|T1) = @bigrams[T1][T2].
      # Word-tag pairs are stored such that P(W|T) = @words[W][T].
      # Tags are stored such that @tags[T] = no. of occurrences of T.
      # The inner default procs use distinct parameter names so they do not
      # shadow the outer block's h/t.
      @bigrams = Hash.new { |h, t| h[t] = Hash.new { |g, u| g[u] = 0 } }
      @words = Hash.new { |h, t| h[t] = Hash.new { |g, u| g[u] = 0 } }
      @tags = Hash.new { |h, t| h[t] = 0 }
      @bigram_smoothing = Hash.new { |h, t| h[t] = 0 }
      @suffixes = Hash.new { |h, t| h[t] = Hash.new { |g, u| g[u] = 0 } }
      self.load sentences
    end

    # Trains the model: counts tag bigrams, word-tag pairs and suffix-tag
    # pairs over the corpus, then converts every counter into a probability.
    def load sentences
      # Sentences are passed in as an ordered array of hash maps, where each
      # hash map represents a word/tag pair, {:word => word, :tag => tag}.

      # We prepend a start token and append an end token to the sentence,
      # then iterate over each bigram in the sentence and increment the
      # relevant counters accordingly.
      sentences.each do |sentence|
        sentence = [{:word => "$start", :tag => "$start"}] + sentence
        sentence += [{:word => "$end", :tag => "$end"}]
        sentence.each_cons(2) do |previous, current|
          @words[current[:word]][current[:tag]] += 1
          @bigrams[previous[:tag]][current[:tag]] += 1
          @tags[current[:tag]] += 1
          # Record the word's last 1..4 characters against its tag so unseen
          # words can later be scored by suffix. For words shorter than i
          # characters, word[-i..-1] is nil, pooling them under a nil key;
          # classify_unknown derives the same nil key, so training and
          # classification stay consistent.
          (1..4).each do |i|
            @suffixes[current[:word][-i..-1]][current[:tag]] += 1
          end
        end
      end

      # For each tag-bigram, we convert its counter value into a
      # probability, and compute the add-one smoothing mass for each
      # preceding tag.
      @bigrams.each do |tag, grams|
        total = grams.values.inject(:+)
        grams.each { |g, n| grams[g] = n.to_f / total }
        # FIX: `1 / (@tags.length + total)` was integer division and always
        # evaluated to 0, silently disabling the smoothing for unseen
        # bigrams that classify relies on.
        @bigram_smoothing[tag] = 1.0 / (@tags.length + total)
      end

      # For each word-tag pair, we convert its counter value into a
      # probability. Words seen no more than once in the corpora are removed
      # (the suffix model handles them instead). We collect the rare words
      # first so that @words is never mutated while it is being iterated.
      rare = @words.reject { |word, tags| tags.values.sum > 1 }.keys
      rare.each { |word| @words.delete word }
      @words.each do |word, tags|
        tags.each { |t, n| tags[t] = n.to_f / @tags[t] }
      end

      # For each suffix-tag pair, we convert its counter value into a
      # probability.
      @suffixes.each do |suffix, tags|
        tags.each { |t, n| tags[t] = n.to_f / @tags[t] }
      end

      # We have now initialised our probability measures for tag-bigrams,
      # word-tag pairs and suffix-tag pairs, storing them in hash maps for
      # easy access.
    end

    # Tags a sentence, passed in as an array of words, e.g.
    # ["Hello", ",", "world"]. Returns an array of {:word => w, :tag => t}
    # hashes, one per input word, chosen by an adaptation of the Viterbi
    # algorithm.
    def classify sentence
      # The variable, paths, stores, for each possible tag of the word we
      # are currently on, the highest scoring path that ends in that tag.
      # For example, on the word 'world' it would hold the two best paths
      # ending in the "NN" and "NNP" readings of 'world'.

      # We initialise the first stage of our paths with the start tag and a
      # score of 1, and append the end token to the sentence.
      # FIX: the end token must be the *string* "$end" — matching how #load
      # keys @words — not a {:word, :tag} hash. The hash never matched any
      # trained entry, so every terminal transition scored 0 and the final
      # max_by degenerated to insertion order instead of the true best path.
      paths = [{:words => [{:word => "$start", :tag => "$start"}], :score => 1}]
      sentence += ["$end"]

      # We iterate over each word in the sentence, initialising a new hash
      # map per word in which we keep the most successful path up to each
      # candidate tag.
      sentence.each do |word|
        new_paths = Hash.new { |h, t| h[t] = {:score => -1} }

        # For each path leading up to the previous word's tags, we calculate
        # a new score for how well it leads on to each of the current word's
        # candidate tags, keeping only the best path per tag — in effect
        # pruning the search space.
        paths.each do |path|
          prev_tag = path[:words].last[:tag]
          tags = @words[word].keys
          # Unknown word: any tag is possible; the suffix model scores them.
          tags = @tags.keys if tags.empty?

          tags.each do |tag|
            # For unseen word-tag pairs we fall back to the suffix-based
            # estimate. For the bigram score, @bigrams[T1][T2] defaults to 0
            # for any unseen pair (thanks to the default procs set up in
            # #initialize), so adding the smoothing term always yields a
            # non-zero transition probability.
            word_score = @words[word][tag] != 0 ? @words[word][tag] : classify_unknown(word, tag)
            bigram_score = @bigram_smoothing[prev_tag] + @bigrams[prev_tag][tag]
            score = path[:score] * word_score * bigram_score
            new_paths[tag] = {
              :words => (path[:words] + [{:word => word, :tag => tag}]),
              :score => score
            } if score > new_paths[tag][:score]
          end
        end

        # Update the best paths up until this word, for each of the word's
        # candidate tags.
        paths = new_paths.values
      end

      # Having covered the entire sentence, pick the highest scoring path.
      # [1..-2] strips the start and end tokens we added.
      return paths.max_by { |a| a[:score] }[:words][1..-2]
    end

    # Estimates P(word|tag) for a word absent from the training vocabulary
    # as a weighted blend of the suffix-tag probabilities of the word's last
    # 1..4 characters (weights 0.05, 0.15, 0.5 and 0.3 respectively).
    def classify_unknown word, tag
      suffixes_weight = [0.05, 0.15, 0.5, 0.3]
      suffixes_probability = (1..4).map do |i|
        @suffixes[word[-i..-1]][tag]
      end
      suffixes_probability.zip(suffixes_weight).map { |pair| pair[0] * pair[1] }.sum
    end

  end

end
|
150
|
+
|
151
|
+
require 'parts/tester'
|
metadata
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: parts
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Joe Root
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-01-16 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: shoulda
|
16
|
+
requirement: &70189868506800 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70189868506800
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: bundler
|
27
|
+
requirement: &70189868506320 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ~>
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 1.0.0
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70189868506320
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: jeweler
|
38
|
+
requirement: &70189868505820 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 1.6.4
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *70189868505820
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: rcov
|
49
|
+
requirement: &70189868492780 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
type: :development
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *70189868492780
|
58
|
+
description: ''
|
59
|
+
email: joe@onlysix.co.uk
|
60
|
+
executables: []
|
61
|
+
extensions: []
|
62
|
+
extra_rdoc_files:
|
63
|
+
- LICENSE.txt
|
64
|
+
- README.rdoc
|
65
|
+
files:
|
66
|
+
- lib/parts.rb
|
67
|
+
- lib/parts/tester.rb
|
68
|
+
- lib/parts/treebank3.2.txt
|
69
|
+
- LICENSE.txt
|
70
|
+
- README.rdoc
|
71
|
+
homepage: http://github.com/joeroot/parts
|
72
|
+
licenses:
|
73
|
+
- MIT
|
74
|
+
post_install_message:
|
75
|
+
rdoc_options: []
|
76
|
+
require_paths:
|
77
|
+
- lib
|
78
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
79
|
+
none: false
|
80
|
+
requirements:
|
81
|
+
- - ! '>='
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: '0'
|
84
|
+
segments:
|
85
|
+
- 0
|
86
|
+
hash: -1093201021619199354
|
87
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
88
|
+
none: false
|
89
|
+
requirements:
|
90
|
+
- - ! '>='
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: '0'
|
93
|
+
requirements: []
|
94
|
+
rubyforge_project:
|
95
|
+
rubygems_version: 1.8.15
|
96
|
+
signing_key:
|
97
|
+
specification_version: 3
|
98
|
+
summary: A simple viterbi-based part of speech tagger
|
99
|
+
test_files: []
|