tagmemics 0.0.0.beta → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/config/adjectives.txt +1137 -0
- data/config/adjectives.txt.bak +1136 -0
- data/config/articles.txt +3 -0
- data/config/conjunctions.txt +7 -0
- data/config/linking_verbs.txt +28 -0
- data/config/prepositions.txt +202 -0
- data/config/pronouns.txt +53 -0
- data/lib/tagmemics.rb +41 -24
- data/lib/tagmemics/{config.rb → load_data.rb} +2 -2
- data/lib/tagmemics/version.rb +3 -0
- data/lib/tagmemics/word.rb +46 -56
- data/lib/tagmemics/word/confidence.rb +65 -0
- data/lib/tagmemics/word/wordnet.rb +38 -9
- metadata +22 -7
@@ -0,0 +1,65 @@
|
|
1
|
+
module Tagmemics
  class Word
    class << self
      # Estimates how likely the word at +index+ in +arr+ is a noun.
      #
      # Add up each category and derive percentage: the WordNet noun
      # probability is weighted by 6 and an article found two positions to
      # the left is weighted by 2; the sum is divided by 10.
      #
      # FIXME: need to scan left, not hard coded index.
      #
      # arr   - Array of word Strings.
      # index - non-negative Integer position of the word within +arr+.
      #
      # Returns a Float.
      def noun_confidence(arr, index)
        str = arr[index]
        # wordnet_probability falls back to 0.0 when WordNet has no noun
        # entry, matching how the verb and adjective scores are computed.
        wordnet_prob = WordNetMethods.wordnet_probability(str, 'noun') * 6

        # A negative offset would wrap around to the END of the array, so
        # words near the start of the sentence simply have no left neighbor.
        neighbor_index = index - 2
        neighbor = neighbor_index >= 0 ? arr[neighbor_index] : nil
        left_neighbor_article = neighbor ? article_confidence(neighbor) * 2 : 0.0

        (wordnet_prob + left_neighbor_article) / 10.0
      end

      # Estimates how likely +str+ is a verb, using only the WordNet
      # probability (weighted by 6, divided by 10).
      #
      # Returns a Float.
      def verb_confidence(str)
        wordnet_prob = WordNetMethods.wordnet_probability(str, 'verb') * 6

        wordnet_prob / 10.0
      end

      # UPDATE PARAMETERS
      #
      # Estimates how likely +str+ is an adjective, using only the WordNet
      # probability (weighted by 6, divided by 10).
      #
      # Returns a Float.
      def adjective_confidence(str)
        wordnet_prob = WordNetMethods.wordnet_probability(str, 'adjective') * 6
        # lneighbor_adjective = 0 * 2
        # rneighbor_verb = 0 * 2

        wordnet_prob / 10.0
      end

      # Checks whether +str+ appears in the word list +constant+, ignoring
      # case.
      #
      # constant - Enumerable of word Strings.
      # str      - the word to look up; nil never matches.
      # positive - optional starting answer; a truthy value is returned
      #            as-is without searching.
      #
      # Returns true when a match is found, otherwise false (or the given
      # +positive+ when +str+ is nil).
      def part_of_speech?(constant, str, positive = false)
        return positive if positive || str.nil?

        target = str.downcase
        constant.any? { |word| word.downcase == target }
      end

      # TODO: adverb detection is not implemented yet.
      #
      # Returns 0.0 so the result is a Float like every other confidence.
      def adverb_confidence(_str)
        0.0
      end

      # Returns 1.0 when +str+ is listed in WordSet::LINKING_VERBS, else 0.0.
      def linking_verb_confidence(str)
        part_of_speech?(Tagmemics::WordSet::LINKING_VERBS, str) ? 1.0 : 0.0
      end

      # Returns 1.0 when +str+ is listed in WordSet::ARTICLES, else 0.0.
      def article_confidence(str)
        part_of_speech?(Tagmemics::WordSet::ARTICLES, str) ? 1.0 : 0.0
      end

      # Returns 1.0 when +str+ is listed in WordSet::PREPOSITIONS, else 0.0.
      def preposition_confidence(str)
        part_of_speech?(Tagmemics::WordSet::PREPOSITIONS, str) ? 1.0 : 0.0
      end

      # Returns 1.0 when +str+ is listed in WordSet::PRONOUNS, else 0.0.
      def pronoun_confidence(str)
        part_of_speech?(Tagmemics::WordSet::PRONOUNS, str) ? 1.0 : 0.0
      end

      # Returns 1.0 when +str+ is listed in WordSet::CONJUNCTIONS, else 0.0.
      def conjunction_confidence(str)
        part_of_speech?(Tagmemics::WordSet::CONJUNCTIONS, str) ? 1.0 : 0.0
      end
    end
  end
end
|
@@ -1,10 +1,8 @@
|
|
1
1
|
require 'wordnet'
|
2
2
|
require 'facets'
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
module Lexicon
|
7
|
-
module WordNet
|
4
|
+
module Tagmemics
|
5
|
+
module WordNetMethods
|
8
6
|
class << self
|
9
7
|
def lex
|
10
8
|
WordNet::Lexicon.new
|
@@ -26,17 +24,48 @@ module Lexicon
|
|
26
24
|
parts_of_speech_frequency(word).values.reduce(:+)
|
27
25
|
end
|
28
26
|
|
29
|
-
|
27
|
+
# returns hash of all possibilities for given dictionary word.
|
28
|
+
# Returns a hash of all possibilities for the given dictionary word:
# each key of parts_of_speech_frequency(word) mapped to its share of
# total_possibilities(word), as a Float.
#
# A new hash is built so the frequency hash is left untouched.
def possibilities(word)
  frequencies = parts_of_speech_frequency(word)
  denom = total_possibilities(word).to_f

  frequencies.each_with_object({}) do |(part_of_speech, count), ratios|
    ratios[part_of_speech] = count / denom
  end
end
|
35
34
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
35
|
+
# Most likely part of speech
|
36
|
+
# Picks the entry (or entries, when tied) with the highest value.
#
# probability_hsh - Hash of part of speech => probability.
#
# Returns a Hash holding every pair whose value equals the maximum, or
# nil when the argument is not a Hash.
def most_likely_pos(probability_hsh)
  return nil unless probability_hsh.is_a?(Hash)

  peak = probability_hsh.values.max
  probability_hsh.select { |_pos, probability| probability == peak }
end
|
41
|
+
|
42
|
+
####
|
43
|
+
#
|
44
|
+
# DELETE ME?
|
45
|
+
#
|
46
|
+
#
|
47
|
+
# Sums the values of the entries selected by most_likely_pos(hsh).
#
# Returns the sum, or nil when most_likely_pos yields nothing (it returns
# nil for a non-Hash argument and an empty Hash for an empty one).
def most_likely_probability(hsh)
  winners = most_likely_pos(hsh)
  # Guard: calling .values on nil would raise NoMethodError.
  return unless winners

  winners.values.reduce(:+)
end
|
50
|
+
|
51
|
+
# Adds together every value stored in +hsh+.
#
# Returns the sum, or nil when +hsh+ has no entries.
def combine_values(hsh)
  hsh.values.inject { |sum, value| sum + value }
end
|
54
|
+
|
55
|
+
# Select highest probable part of speech and combine with any
|
56
|
+
# others with a similar name, i.e. Adjective and Adjective Satellite;
|
57
|
+
# they will be added together.
|
58
|
+
# Looks up the possibilities for +word+ and totals the probability of
# every key that contains +part_of_speech+ as one of its
# whitespace-separated words.
#
# Returns the combined probability, or 0.0 when nothing matched.
def wordnet_probability(word, part_of_speech)
  matching = possibilities(word).select do |label, _probability|
    label.split.include?(part_of_speech)
  end

  # combine_values gives nil for an empty selection
  combine_values(matching) || 0.0
end
|
63
|
+
|
64
|
+
# TODO: Not using this. Delete?
|
65
|
+
# Fraction of entries in +hsh+ whose value is truthy (not nil or false).
#
# Returns a Float; an empty hash gives 0.0 rather than NaN from 0 / 0.0.
def decimal_complete(hsh)
  return 0.0 if hsh.empty?

  complete = hsh.count { |_k, v| v } # not nil
  complete / hsh.length.to_f
end
|
41
70
|
end
|
42
71
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tagmemics
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Mason
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-10-
|
11
|
+
date: 2015-10-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: facets
|
@@ -122,7 +122,13 @@ dependencies:
|
|
122
122
|
- - "~>"
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '2.7'
|
125
|
-
description:
|
125
|
+
description: "\n The English language is extremely complicated. We have words
|
126
|
+
that can have multiple\n parts of speech. Natural language processing is difficult
|
127
|
+
because it is hard to\n tell if a word is a noun when it could be a verb or an
|
128
|
+
adjective, etc.\n\n The purpose of this project is to develop an algorithm that,
|
129
|
+
given a sentence string,\n has a ranking system that detects the part of speech
|
130
|
+
of each word.\n\n Why is this useful? Because understanding the correct parts of
|
131
|
+
speech in a sentence\n is the first step to teaching a robot how to read.\n "
|
126
132
|
email: mace2345@gmail.com
|
127
133
|
executables: []
|
128
134
|
extensions: []
|
@@ -130,10 +136,19 @@ extra_rdoc_files: []
|
|
130
136
|
files:
|
131
137
|
- README.md
|
132
138
|
- Rakefile
|
139
|
+
- config/adjectives.txt
|
140
|
+
- config/adjectives.txt.bak
|
141
|
+
- config/articles.txt
|
142
|
+
- config/conjunctions.txt
|
143
|
+
- config/linking_verbs.txt
|
144
|
+
- config/prepositions.txt
|
145
|
+
- config/pronouns.txt
|
133
146
|
- lib/tagmemics.rb
|
134
|
-
- lib/tagmemics/
|
147
|
+
- lib/tagmemics/load_data.rb
|
135
148
|
- lib/tagmemics/sentence.rb
|
149
|
+
- lib/tagmemics/version.rb
|
136
150
|
- lib/tagmemics/word.rb
|
151
|
+
- lib/tagmemics/word/confidence.rb
|
137
152
|
- lib/tagmemics/word/wordnet.rb
|
138
153
|
homepage: http://github.com/m8ss/tagmemics
|
139
154
|
licenses:
|
@@ -150,13 +165,13 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
150
165
|
version: '0'
|
151
166
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
152
167
|
requirements:
|
153
|
-
- - "
|
168
|
+
- - ">="
|
154
169
|
- !ruby/object:Gem::Version
|
155
|
-
version:
|
170
|
+
version: '0'
|
156
171
|
requirements: []
|
157
172
|
rubyforge_project:
|
158
173
|
rubygems_version: 2.4.5
|
159
174
|
signing_key:
|
160
175
|
specification_version: 4
|
161
|
-
summary:
|
176
|
+
summary: Detect parts of speech from a sentence.
|
162
177
|
test_files: []
|