tagmemics 0.0.0.beta → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/config/adjectives.txt +1137 -0
- data/config/adjectives.txt.bak +1136 -0
- data/config/articles.txt +3 -0
- data/config/conjunctions.txt +7 -0
- data/config/linking_verbs.txt +28 -0
- data/config/prepositions.txt +202 -0
- data/config/pronouns.txt +53 -0
- data/lib/tagmemics.rb +41 -24
- data/lib/tagmemics/{config.rb → load_data.rb} +2 -2
- data/lib/tagmemics/version.rb +3 -0
- data/lib/tagmemics/word.rb +46 -56
- data/lib/tagmemics/word/confidence.rb +65 -0
- data/lib/tagmemics/word/wordnet.rb +38 -9
- metadata +22 -7
data/config/linking_verbs.txt
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
Be
|
2
|
+
Am
|
3
|
+
Is
|
4
|
+
Are
|
5
|
+
Was
|
6
|
+
Were
|
7
|
+
Has become
|
8
|
+
Could have come
|
9
|
+
Shall be
|
10
|
+
Shall have been
|
11
|
+
Have appeared
|
12
|
+
Should have appeared
|
13
|
+
Will be
|
14
|
+
Will have been
|
15
|
+
Had seemed
|
16
|
+
Should have been
|
17
|
+
Has been
|
18
|
+
Have been
|
19
|
+
Had been
|
20
|
+
Can be
|
21
|
+
May be
|
22
|
+
Might be
|
23
|
+
Should be
|
24
|
+
Could be
|
25
|
+
Become
|
26
|
+
Would be
|
27
|
+
Appear
|
28
|
+
Seem
|
data/config/prepositions.txt
ADDED
@@ -0,0 +1,202 @@
|
|
1
|
+
'gainst
|
2
|
+
'mongst
|
3
|
+
'neath
|
4
|
+
'twixt
|
5
|
+
abaft
|
6
|
+
abeam
|
7
|
+
aboard
|
8
|
+
about
|
9
|
+
above
|
10
|
+
absent
|
11
|
+
according to
|
12
|
+
across
|
13
|
+
afore
|
14
|
+
after
|
15
|
+
against
|
16
|
+
ago
|
17
|
+
ahead of
|
18
|
+
along
|
19
|
+
alongside
|
20
|
+
amid
|
21
|
+
amidst
|
22
|
+
among
|
23
|
+
amongst
|
24
|
+
anenst
|
25
|
+
anent
|
26
|
+
anti
|
27
|
+
apart
|
28
|
+
apart from
|
29
|
+
apropos
|
30
|
+
apud
|
31
|
+
around
|
32
|
+
as
|
33
|
+
as far as
|
34
|
+
as for
|
35
|
+
as of
|
36
|
+
as opposed to
|
37
|
+
as per
|
38
|
+
as regards
|
39
|
+
as soon as
|
40
|
+
as well as
|
41
|
+
aside
|
42
|
+
aside from
|
43
|
+
astern of
|
44
|
+
astride
|
45
|
+
at
|
46
|
+
at the behest of
|
47
|
+
athwart
|
48
|
+
atop
|
49
|
+
away
|
50
|
+
ayond
|
51
|
+
ayont
|
52
|
+
back to
|
53
|
+
barring
|
54
|
+
because of
|
55
|
+
before
|
56
|
+
behind
|
57
|
+
behither
|
58
|
+
below
|
59
|
+
beneath
|
60
|
+
beside
|
61
|
+
besides
|
62
|
+
between
|
63
|
+
betwixen
|
64
|
+
betwixt
|
65
|
+
beyond
|
66
|
+
biforn
|
67
|
+
but
|
68
|
+
by
|
69
|
+
by means of
|
70
|
+
by virtue of
|
71
|
+
chez
|
72
|
+
circa
|
73
|
+
close to
|
74
|
+
concerning
|
75
|
+
considering
|
76
|
+
contra
|
77
|
+
cum
|
78
|
+
despite
|
79
|
+
down
|
80
|
+
due to
|
81
|
+
during
|
82
|
+
ere
|
83
|
+
except
|
84
|
+
except for
|
85
|
+
excluding
|
86
|
+
failing
|
87
|
+
far from
|
88
|
+
following
|
89
|
+
for
|
90
|
+
for the sake of
|
91
|
+
forby
|
92
|
+
forenenst
|
93
|
+
fornenst
|
94
|
+
fornent
|
95
|
+
from
|
96
|
+
fromward
|
97
|
+
froward
|
98
|
+
frowards
|
99
|
+
gainst
|
100
|
+
given
|
101
|
+
hence
|
102
|
+
in
|
103
|
+
in accordance with
|
104
|
+
in addition to
|
105
|
+
in case of
|
106
|
+
in front of
|
107
|
+
in lieu of
|
108
|
+
in order to
|
109
|
+
in place of
|
110
|
+
in point of
|
111
|
+
in re
|
112
|
+
in spite of
|
113
|
+
in to
|
114
|
+
including
|
115
|
+
inside
|
116
|
+
inside of
|
117
|
+
inside out
|
118
|
+
instead of
|
119
|
+
into
|
120
|
+
left of
|
121
|
+
like
|
122
|
+
mid
|
123
|
+
midst
|
124
|
+
minus
|
125
|
+
modulo
|
126
|
+
near
|
127
|
+
near to
|
128
|
+
neath
|
129
|
+
next
|
130
|
+
next to
|
131
|
+
notwithstanding
|
132
|
+
o'
|
133
|
+
of
|
134
|
+
off
|
135
|
+
on
|
136
|
+
on account of
|
137
|
+
on behalf of
|
138
|
+
on to
|
139
|
+
on top of
|
140
|
+
onto
|
141
|
+
opposite
|
142
|
+
opposite of
|
143
|
+
opposite to
|
144
|
+
out
|
145
|
+
out from
|
146
|
+
out of
|
147
|
+
outside
|
148
|
+
outside of
|
149
|
+
outwith
|
150
|
+
over
|
151
|
+
overthwart
|
152
|
+
owing to
|
153
|
+
pace
|
154
|
+
past
|
155
|
+
per
|
156
|
+
plus
|
157
|
+
prior to
|
158
|
+
pro
|
159
|
+
pursuant to
|
160
|
+
qua
|
161
|
+
rather than
|
162
|
+
re
|
163
|
+
regarding
|
164
|
+
regardless of
|
165
|
+
right of
|
166
|
+
round
|
167
|
+
sans
|
168
|
+
save
|
169
|
+
since
|
170
|
+
subsequent to
|
171
|
+
such as
|
172
|
+
than
|
173
|
+
thanks to
|
174
|
+
that of
|
175
|
+
through
|
176
|
+
throughout
|
177
|
+
till
|
178
|
+
times
|
179
|
+
to
|
180
|
+
tofore
|
181
|
+
toforn
|
182
|
+
toward
|
183
|
+
towards
|
184
|
+
under
|
185
|
+
underneath
|
186
|
+
unlike
|
187
|
+
until
|
188
|
+
unto
|
189
|
+
up
|
190
|
+
up to
|
191
|
+
upon
|
192
|
+
versus
|
193
|
+
via
|
194
|
+
vice
|
195
|
+
with
|
196
|
+
with a view to
|
197
|
+
with regard to
|
198
|
+
with respect to
|
199
|
+
withal
|
200
|
+
within
|
201
|
+
without
|
202
|
+
worth
|
data/config/pronouns.txt
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
I
|
2
|
+
he
|
3
|
+
her
|
4
|
+
hers
|
5
|
+
herself
|
6
|
+
him
|
7
|
+
himself
|
8
|
+
his
|
9
|
+
hisself
|
10
|
+
it
|
11
|
+
its
|
12
|
+
itself
|
13
|
+
me
|
14
|
+
mine
|
15
|
+
my
|
16
|
+
myself
|
17
|
+
one
|
18
|
+
one's
|
19
|
+
oneself
|
20
|
+
our
|
21
|
+
ours
|
22
|
+
ourself
|
23
|
+
ourselves
|
24
|
+
she
|
25
|
+
thee
|
26
|
+
their
|
27
|
+
theirs
|
28
|
+
theirself
|
29
|
+
theirselves
|
30
|
+
them
|
31
|
+
themself
|
32
|
+
themselves
|
33
|
+
they
|
34
|
+
thine
|
35
|
+
thou
|
36
|
+
thy
|
37
|
+
thyself
|
38
|
+
us
|
39
|
+
we
|
40
|
+
who
|
41
|
+
whom
|
42
|
+
whomself
|
43
|
+
whose
|
44
|
+
whoself
|
45
|
+
y'all
|
46
|
+
ye
|
47
|
+
you
|
48
|
+
you all
|
49
|
+
your
|
50
|
+
yours
|
51
|
+
yourself
|
52
|
+
yourselves
|
53
|
+
youse
|
data/lib/tagmemics.rb
CHANGED
@@ -1,35 +1,52 @@
|
|
1
1
|
require_relative './tagmemics/word'
|
2
2
|
require_relative './tagmemics/sentence'
|
3
|
+
require_relative './tagmemics/load_data'
|
3
4
|
|
4
|
-
|
5
|
-
module Lexicon
|
5
|
+
module Tagmemics
|
6
6
|
def self.parse(str)
|
7
|
-
|
7
|
+
WordSet.new(str)
|
8
8
|
end
|
9
|
-
end
|
10
9
|
|
11
|
-
# The output of
|
12
|
-
class
|
13
|
-
|
14
|
-
|
10
|
+
# The output of Tagmemics.parse
|
11
|
+
class WordSet
|
12
|
+
ARTICLES = %w(the an a)
|
13
|
+
CONJUNCTIONS = %w(for and nor but or yet so )
|
14
|
+
LINKING_VERBS = LoadData.contents_to_a('linking_verbs')
|
15
|
+
PRONOUNS = LoadData.contents_to_a('pronouns')
|
16
|
+
PREPOSITIONS = LoadData.contents_to_a('prepositions')
|
15
17
|
|
16
|
-
|
17
|
-
|
18
|
-
end
|
18
|
+
attr_accessor :nouns, :verbs, :articles, :adjectives, :adverbs,
|
19
|
+
:prepositions, :conjunctions, :pronouns, :collection
|
19
20
|
|
20
|
-
|
21
|
-
|
22
|
-
|
21
|
+
def initialize(str)
|
22
|
+
@collection = []
|
23
|
+
arr = WordSet.sentence_to_array(str)
|
24
|
+
arr.each { |word| @collection << Word.new(word) }
|
25
|
+
# @set = WordSet.start_hash(WordSet.sentence_to_array(str))
|
26
|
+
end
|
27
|
+
|
28
|
+
class << self
|
29
|
+
include LoadData
|
30
|
+
|
31
|
+
# Will probably want to use punctuation in the future.
|
32
|
+
# For now, this removes it.
|
33
|
+
def sentence_to_array(sentence)
|
34
|
+
sentence.split(/\s+|\W+\z/)
|
35
|
+
end
|
23
36
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
37
|
+
# Moved part of speech. This will not work right now.
|
38
|
+
# Probably need to delete this.
|
39
|
+
def start_hash(arr)
|
40
|
+
arr.map do |word|
|
41
|
+
result =
|
42
|
+
case
|
43
|
+
when part_of_speech(ARTICLES, word).any? then :article
|
44
|
+
when part_of_speech(CONJUNCTIONS, word).any? then :conjunction
|
45
|
+
when part_of_speech(PRONOUNS, word).any? then :pronoun
|
46
|
+
end
|
47
|
+
[word, result]
|
48
|
+
end.to_h
|
49
|
+
end
|
50
|
+
end
|
34
51
|
end
|
35
52
|
end
|
data/lib/tagmemics/{config.rb → load_data.rb}
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# Retrieves data from config folder to save to constants.
|
2
|
-
module Config
|
2
|
+
module LoadData
|
3
3
|
def self.config_path
|
4
4
|
File.join(File.dirname(__FILE__), '../../config')
|
5
5
|
end
|
@@ -24,7 +24,7 @@ module Config
|
|
24
24
|
page = agent.get(uri)
|
25
25
|
destination = "./config/#{part_of_speech}.txt"
|
26
26
|
target = page.search(css_selector)
|
27
|
-
regx = /[
|
27
|
+
regx = /[^\047a-zA-Z\s]/ # \047 is an apostrophe
|
28
28
|
|
29
29
|
arr = []
|
30
30
|
target.each do |x|
|
data/lib/tagmemics/word.rb
CHANGED
@@ -1,69 +1,59 @@
|
|
1
1
|
require 'wordnet'
|
2
2
|
require 'facets'
|
3
|
-
require_relative './config'
|
3
|
+
require_relative './load_data'
|
4
4
|
require_relative './word/wordnet'
|
5
|
+
require_relative './word/confidence'
|
5
6
|
|
6
|
-
module
|
7
|
+
module Tagmemics
|
7
8
|
class Word
|
8
|
-
include Config
|
9
|
-
|
10
|
-
ARTICLES = %w(the an a)
|
11
|
-
CONJUNCTIONS = %w(for and nor but or yet so )
|
12
|
-
PRONOUNS = Config.contents_to_a('pronouns')
|
13
|
-
|
14
|
-
|
15
|
-
def part_of_speech(constant, str)
|
16
|
-
arr = []
|
17
|
-
constant.each do |word|
|
18
|
-
regx = /\b#{word}\b/i
|
19
|
-
arr << word if regx =~ str # word phrase matches
|
20
|
-
end
|
21
|
-
arr
|
22
|
-
end
|
23
|
-
|
24
|
-
def decimal_complete(hsh)
|
25
|
-
total = hsh.length
|
26
|
-
complete = hsh.count { |_k, v| v } # not nil
|
27
|
-
complete / total.to_f
|
28
|
-
end
|
29
|
-
|
30
9
|
def initialize(word)
|
31
|
-
@
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
def verb_confidence(str)
|
52
|
-
end
|
53
|
-
|
54
|
-
def adjective_confidence(str)
|
55
|
-
end
|
10
|
+
@str = word
|
11
|
+
puts "examining: #{word}"
|
12
|
+
@tagmemic_confidence = Word.confidence_levels(word)
|
13
|
+
end
|
14
|
+
|
15
|
+
class << self
|
16
|
+
##
|
17
|
+
# Because WordNet only tracks verbs, nouns, adverbs and adjectives,
|
18
|
+
# confidence levels can only be updated for those values. The other words
|
19
|
+
# such as pronouns, prepositions, and conjunctions are based off of list
|
20
|
+
# in config folder. Their score is pass or fail and is
|
21
|
+
# calculated as 0 or 1.0.
|
22
|
+
def confidence_levels(word)
|
23
|
+
word = word.downcase
|
24
|
+
known_hsh = determine_known_words(word)
|
25
|
+
wordnet_hsh = determine_wordnet_words(word)
|
26
|
+
|
27
|
+
hsh = everything_nil(known_hsh) ? wordnet_hsh : known_hsh
|
28
|
+
delete_nogos(hsh)
|
29
|
+
end
|
56
30
|
|
57
|
-
|
58
|
-
|
31
|
+
def determine_known_words(word)
|
32
|
+
{
|
33
|
+
:article => article_confidence(word),
|
34
|
+
:preposition => preposition_confidence(word),
|
35
|
+
:pronoun => pronoun_confidence(word),
|
36
|
+
:conjunction => conjunction_confidence(word),
|
37
|
+
:linking_verb => linking_verb_confidence(word)
|
38
|
+
}
|
39
|
+
end
|
59
40
|
|
60
|
-
|
61
|
-
|
41
|
+
def everything_nil(hsh)
|
42
|
+
(hsh.select { |_k, v| v != 0.0 && !v.nil? }.empty?)
|
43
|
+
end
|
62
44
|
|
63
|
-
|
64
|
-
|
45
|
+
def delete_nogos(hsh)
|
46
|
+
hsh.delete_if { |_k, v| v == 0.0 || v.nil? }
|
47
|
+
end
|
65
48
|
|
66
|
-
|
49
|
+
def determine_wordnet_words(word)
|
50
|
+
{
|
51
|
+
:noun => WordNetMethods.wordnet_probability(word, 'noun'),
|
52
|
+
:verb => WordNetMethods.wordnet_probability(word, 'verb'),
|
53
|
+
:adjective => WordNetMethods.wordnet_probability(word, 'adjective'),
|
54
|
+
:adverb => WordNetMethods.wordnet_probability(word, 'adverb')
|
55
|
+
}
|
56
|
+
end
|
67
57
|
end
|
68
58
|
end
|
69
59
|
end
|