tagmemics 0.0.0.beta → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/config/adjectives.txt +1137 -0
- data/config/adjectives.txt.bak +1136 -0
- data/config/articles.txt +3 -0
- data/config/conjunctions.txt +7 -0
- data/config/linking_verbs.txt +28 -0
- data/config/prepositions.txt +202 -0
- data/config/pronouns.txt +53 -0
- data/lib/tagmemics.rb +41 -24
- data/lib/tagmemics/{config.rb → load_data.rb} +2 -2
- data/lib/tagmemics/version.rb +3 -0
- data/lib/tagmemics/word.rb +46 -56
- data/lib/tagmemics/word/confidence.rb +65 -0
- data/lib/tagmemics/word/wordnet.rb +38 -9
- metadata +22 -7
data/config/linking_verbs.txt
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
Be
|
2
|
+
Am
|
3
|
+
Is
|
4
|
+
Are
|
5
|
+
Was
|
6
|
+
Were
|
7
|
+
Has become
|
8
|
+
Could have come
|
9
|
+
Shall be
|
10
|
+
Shall have been
|
11
|
+
Have appeared
|
12
|
+
Should have appeared
|
13
|
+
Will be
|
14
|
+
Will have been
|
15
|
+
Had seemed
|
16
|
+
Should have been
|
17
|
+
Has been
|
18
|
+
Have been
|
19
|
+
Had been
|
20
|
+
Can be
|
21
|
+
May be
|
22
|
+
Might be
|
23
|
+
Should be
|
24
|
+
Could be
|
25
|
+
Become
|
26
|
+
Would be
|
27
|
+
Appear
|
28
|
+
Seem
|
data/config/prepositions.txt
ADDED
@@ -0,0 +1,202 @@
|
|
1
|
+
'gainst
|
2
|
+
'mongst
|
3
|
+
'neath
|
4
|
+
'twixt
|
5
|
+
abaft
|
6
|
+
abeam
|
7
|
+
aboard
|
8
|
+
about
|
9
|
+
above
|
10
|
+
absent
|
11
|
+
according to
|
12
|
+
across
|
13
|
+
afore
|
14
|
+
after
|
15
|
+
against
|
16
|
+
ago
|
17
|
+
ahead of
|
18
|
+
along
|
19
|
+
alongside
|
20
|
+
amid
|
21
|
+
amidst
|
22
|
+
among
|
23
|
+
amongst
|
24
|
+
anenst
|
25
|
+
anent
|
26
|
+
anti
|
27
|
+
apart
|
28
|
+
apart from
|
29
|
+
apropos
|
30
|
+
apud
|
31
|
+
around
|
32
|
+
as
|
33
|
+
as far as
|
34
|
+
as for
|
35
|
+
as of
|
36
|
+
as opposed to
|
37
|
+
as per
|
38
|
+
as regards
|
39
|
+
as soon as
|
40
|
+
as well as
|
41
|
+
aside
|
42
|
+
aside from
|
43
|
+
astern of
|
44
|
+
astride
|
45
|
+
at
|
46
|
+
at the behest of
|
47
|
+
athwart
|
48
|
+
atop
|
49
|
+
away
|
50
|
+
ayond
|
51
|
+
ayont
|
52
|
+
back to
|
53
|
+
barring
|
54
|
+
because of
|
55
|
+
before
|
56
|
+
behind
|
57
|
+
behither
|
58
|
+
below
|
59
|
+
beneath
|
60
|
+
beside
|
61
|
+
besides
|
62
|
+
between
|
63
|
+
betwixen
|
64
|
+
betwixt
|
65
|
+
beyond
|
66
|
+
biforn
|
67
|
+
but
|
68
|
+
by
|
69
|
+
by means of
|
70
|
+
by virtue of
|
71
|
+
chez
|
72
|
+
circa
|
73
|
+
close to
|
74
|
+
concerning
|
75
|
+
considering
|
76
|
+
contra
|
77
|
+
cum
|
78
|
+
despite
|
79
|
+
down
|
80
|
+
due to
|
81
|
+
during
|
82
|
+
ere
|
83
|
+
except
|
84
|
+
except for
|
85
|
+
excluding
|
86
|
+
failing
|
87
|
+
far from
|
88
|
+
following
|
89
|
+
for
|
90
|
+
for the sake of
|
91
|
+
forby
|
92
|
+
forenenst
|
93
|
+
fornenst
|
94
|
+
fornent
|
95
|
+
from
|
96
|
+
fromward
|
97
|
+
froward
|
98
|
+
frowards
|
99
|
+
gainst
|
100
|
+
given
|
101
|
+
hence
|
102
|
+
in
|
103
|
+
in accordance with
|
104
|
+
in addition to
|
105
|
+
in case of
|
106
|
+
in front of
|
107
|
+
in lieu of
|
108
|
+
in order to
|
109
|
+
in place of
|
110
|
+
in point of
|
111
|
+
in re
|
112
|
+
in spite of
|
113
|
+
in to
|
114
|
+
including
|
115
|
+
inside
|
116
|
+
inside of
|
117
|
+
inside out
|
118
|
+
instead of
|
119
|
+
into
|
120
|
+
left of
|
121
|
+
like
|
122
|
+
mid
|
123
|
+
midst
|
124
|
+
minus
|
125
|
+
modulo
|
126
|
+
near
|
127
|
+
near to
|
128
|
+
neath
|
129
|
+
next
|
130
|
+
next to
|
131
|
+
notwithstanding
|
132
|
+
o'
|
133
|
+
of
|
134
|
+
off
|
135
|
+
on
|
136
|
+
on account of
|
137
|
+
on behalf of
|
138
|
+
on to
|
139
|
+
on top of
|
140
|
+
onto
|
141
|
+
opposite
|
142
|
+
opposite of
|
143
|
+
opposite to
|
144
|
+
out
|
145
|
+
out from
|
146
|
+
out of
|
147
|
+
outside
|
148
|
+
outside of
|
149
|
+
outwith
|
150
|
+
over
|
151
|
+
overthwart
|
152
|
+
owing to
|
153
|
+
pace
|
154
|
+
past
|
155
|
+
per
|
156
|
+
plus
|
157
|
+
prior to
|
158
|
+
pro
|
159
|
+
pursuant to
|
160
|
+
qua
|
161
|
+
rather than
|
162
|
+
re
|
163
|
+
regarding
|
164
|
+
regardless of
|
165
|
+
right of
|
166
|
+
round
|
167
|
+
sans
|
168
|
+
save
|
169
|
+
since
|
170
|
+
subsequent to
|
171
|
+
such as
|
172
|
+
than
|
173
|
+
thanks to
|
174
|
+
that of
|
175
|
+
through
|
176
|
+
throughout
|
177
|
+
till
|
178
|
+
times
|
179
|
+
to
|
180
|
+
tofore
|
181
|
+
toforn
|
182
|
+
toward
|
183
|
+
towards
|
184
|
+
under
|
185
|
+
underneath
|
186
|
+
unlike
|
187
|
+
until
|
188
|
+
unto
|
189
|
+
up
|
190
|
+
up to
|
191
|
+
upon
|
192
|
+
versus
|
193
|
+
via
|
194
|
+
vice
|
195
|
+
with
|
196
|
+
with a view to
|
197
|
+
with regard to
|
198
|
+
with respect to
|
199
|
+
withal
|
200
|
+
within
|
201
|
+
without
|
202
|
+
worth
|
data/config/pronouns.txt
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
I
|
2
|
+
he
|
3
|
+
her
|
4
|
+
hers
|
5
|
+
herself
|
6
|
+
him
|
7
|
+
himself
|
8
|
+
his
|
9
|
+
hisself
|
10
|
+
it
|
11
|
+
its
|
12
|
+
itself
|
13
|
+
me
|
14
|
+
mine
|
15
|
+
my
|
16
|
+
myself
|
17
|
+
one
|
18
|
+
one's
|
19
|
+
oneself
|
20
|
+
our
|
21
|
+
ours
|
22
|
+
ourself
|
23
|
+
ourselves
|
24
|
+
she
|
25
|
+
thee
|
26
|
+
their
|
27
|
+
theirs
|
28
|
+
theirself
|
29
|
+
theirselves
|
30
|
+
them
|
31
|
+
themself
|
32
|
+
themselves
|
33
|
+
they
|
34
|
+
thine
|
35
|
+
thou
|
36
|
+
thy
|
37
|
+
thyself
|
38
|
+
us
|
39
|
+
we
|
40
|
+
who
|
41
|
+
whom
|
42
|
+
whomself
|
43
|
+
whose
|
44
|
+
whoself
|
45
|
+
y'all
|
46
|
+
ye
|
47
|
+
you
|
48
|
+
you all
|
49
|
+
your
|
50
|
+
yours
|
51
|
+
yourself
|
52
|
+
yourselves
|
53
|
+
youse
|
data/lib/tagmemics.rb
CHANGED
@@ -1,35 +1,52 @@
|
|
1
1
|
require_relative './tagmemics/word'
|
2
2
|
require_relative './tagmemics/sentence'
|
3
|
+
require_relative './tagmemics/load_data'
|
3
4
|
|
4
|
-
|
5
|
-
module Lexicon
|
5
|
+
module Tagmemics
|
6
6
|
def self.parse(str)
|
7
|
-
|
7
|
+
WordSet.new(str)
|
8
8
|
end
|
9
|
-
end
|
10
9
|
|
11
|
-
# The output of
|
12
|
-
class
|
13
|
-
|
14
|
-
|
10
|
+
# The output of Tagmemics.parse
|
11
|
+
class WordSet
|
12
|
+
ARTICLES = %w(the an a)
|
13
|
+
CONJUNCTIONS = %w(for and nor but or yet so )
|
14
|
+
LINKING_VERBS = LoadData.contents_to_a('linking_verbs')
|
15
|
+
PRONOUNS = LoadData.contents_to_a('pronouns')
|
16
|
+
PREPOSITIONS = LoadData.contents_to_a('prepositions')
|
15
17
|
|
16
|
-
|
17
|
-
|
18
|
-
end
|
18
|
+
attr_accessor :nouns, :verbs, :articles, :adjectives, :adverbs,
|
19
|
+
:prepositions, :conjunctions, :pronouns, :collection
|
19
20
|
|
20
|
-
|
21
|
-
|
22
|
-
|
21
|
+
def initialize(str)
|
22
|
+
@collection = []
|
23
|
+
arr = WordSet.sentence_to_array(str)
|
24
|
+
arr.each { |word| @collection << Word.new(word) }
|
25
|
+
# @set = WordSet.start_hash(WordSet.sentence_to_array(str))
|
26
|
+
end
|
27
|
+
|
28
|
+
class << self
|
29
|
+
include LoadData
|
30
|
+
|
31
|
+
# Will probably want to use punctuation in the future.
|
32
|
+
# For now, this removes it.
|
33
|
+
def sentence_to_array(sentence)
|
34
|
+
sentence.split(/\s+|\W+\z/)
|
35
|
+
end
|
23
36
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
37
|
+
# Moved part of speech. This will not work right now.
|
38
|
+
# Probably need to delete this.
|
39
|
+
def start_hash(arr)
|
40
|
+
arr.map do |word|
|
41
|
+
result =
|
42
|
+
case
|
43
|
+
when part_of_speech(ARTICLES, word).any? then :article
|
44
|
+
when part_of_speech(CONJUNCTIONS, word).any? then :conjunction
|
45
|
+
when part_of_speech(PRONOUNS, word).any? then :pronoun
|
46
|
+
end
|
47
|
+
[word, result]
|
48
|
+
end.to_h
|
49
|
+
end
|
50
|
+
end
|
34
51
|
end
|
35
52
|
end
|
data/lib/tagmemics/{config.rb → load_data.rb}
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
# Retrieves data from config folder to save to constants.
|
2
|
-
module
|
2
|
+
module LoadData
|
3
3
|
def self.config_path
|
4
4
|
File.join(File.dirname(__FILE__), '../../config')
|
5
5
|
end
|
@@ -24,7 +24,7 @@ module Config
|
|
24
24
|
page = agent.get(uri)
|
25
25
|
destination = "./config/#{part_of_speech}.txt"
|
26
26
|
target = page.search(css_selector)
|
27
|
-
regx = /[
|
27
|
+
regx = /[^\047a-zA-Z\s]/ # \047 is an apostrophe
|
28
28
|
|
29
29
|
arr = []
|
30
30
|
target.each do |x|
|
data/lib/tagmemics/word.rb
CHANGED
@@ -1,69 +1,59 @@
|
|
1
1
|
require 'wordnet'
|
2
2
|
require 'facets'
|
3
|
-
require_relative './
|
3
|
+
require_relative './load_data'
|
4
4
|
require_relative './word/wordnet'
|
5
|
+
require_relative './word/confidence'
|
5
6
|
|
6
|
-
module
|
7
|
+
module Tagmemics
|
7
8
|
class Word
|
8
|
-
include Config
|
9
|
-
|
10
|
-
ARTICLES = %w(the an a)
|
11
|
-
CONJUNCTIONS = %w(for and nor but or yet so )
|
12
|
-
PRONOUNS = Config.contents_to_a('pronouns')
|
13
|
-
|
14
|
-
|
15
|
-
def part_of_speech(constant, str)
|
16
|
-
arr = []
|
17
|
-
constant.each do |word|
|
18
|
-
regx = /\b#{word}\b/i
|
19
|
-
arr << word if regx =~ str # word phrase matches
|
20
|
-
end
|
21
|
-
arr
|
22
|
-
end
|
23
|
-
|
24
|
-
def decimal_complete(hsh)
|
25
|
-
total = hsh.length
|
26
|
-
complete = hsh.count { |_k, v| v } # not nil
|
27
|
-
complete / total.to_f
|
28
|
-
end
|
29
|
-
|
30
9
|
def initialize(word)
|
31
|
-
@
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
def verb_confidence(str)
|
52
|
-
end
|
53
|
-
|
54
|
-
def adjective_confidence(str)
|
55
|
-
end
|
10
|
+
@str = word
|
11
|
+
puts "examining: #{word}"
|
12
|
+
@tagmemic_confidence = Word.confidence_levels(word)
|
13
|
+
end
|
14
|
+
|
15
|
+
class << self
|
16
|
+
##
|
17
|
+
# Because WordNet only tracks verbs, nouns, adverbs and adjectives,
|
18
|
+
# confidence levels can only be updated for those values. The other words
|
19
|
+
# such as pronouns, prepositions, and conjunctions are based off of list
|
20
|
+
# in config folder. Their score is pass or fail and is
|
21
|
+
# calculated as 0 or 1.0.
|
22
|
+
def confidence_levels(word)
|
23
|
+
word = word.downcase
|
24
|
+
known_hsh = determine_known_words(word)
|
25
|
+
wordnet_hsh = determine_wordnet_words(word)
|
26
|
+
|
27
|
+
hsh = everything_nil(known_hsh) ? wordnet_hsh : known_hsh
|
28
|
+
delete_nogos(hsh)
|
29
|
+
end
|
56
30
|
|
57
|
-
|
58
|
-
|
31
|
+
def determine_known_words(word)
|
32
|
+
{
|
33
|
+
:article => article_confidence(word),
|
34
|
+
:preposition => preposition_confidence(word),
|
35
|
+
:pronoun => pronoun_confidence(word),
|
36
|
+
:conjunction => conjunction_confidence(word),
|
37
|
+
:linking_verb => linking_verb_confidence(word)
|
38
|
+
}
|
39
|
+
end
|
59
40
|
|
60
|
-
|
61
|
-
|
41
|
+
def everything_nil(hsh)
|
42
|
+
(hsh.select { |_k, v| v != 0.0 && !v.nil? }.empty?)
|
43
|
+
end
|
62
44
|
|
63
|
-
|
64
|
-
|
45
|
+
def delete_nogos(hsh)
|
46
|
+
hsh.delete_if { |_k, v| v == 0.0 || v.nil? }
|
47
|
+
end
|
65
48
|
|
66
|
-
|
49
|
+
def determine_wordnet_words(word)
|
50
|
+
{
|
51
|
+
:noun => WordNetMethods.wordnet_probability(word, 'noun'),
|
52
|
+
:verb => WordNetMethods.wordnet_probability(word, 'verb'),
|
53
|
+
:adjective => WordNetMethods.wordnet_probability(word, 'adjective'),
|
54
|
+
:adverb => WordNetMethods.wordnet_probability(word, 'adverb')
|
55
|
+
}
|
56
|
+
end
|
67
57
|
end
|
68
58
|
end
|
69
59
|
end
|