llt-morphologizer 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/.travis.yml +8 -0
- data/Gemfile +27 -0
- data/LICENSE.txt +22 -0
- data/README.md +35 -0
- data/Rakefile +6 -0
- data/lib/llt/morphologizer.rb +378 -0
- data/lib/llt/morphologizer/lookup_statement.rb +66 -0
- data/lib/llt/morphologizer/stem_lookup_statement_builder.rb +130 -0
- data/lib/llt/morphologizer/stem_lookup_statement_builder/conjugable.rb +221 -0
- data/lib/llt/morphologizer/stem_lookup_statement_builder/contracted_forms.rb +38 -0
- data/lib/llt/morphologizer/stem_lookup_statement_builder/declinable.rb +214 -0
- data/lib/llt/morphologizer/version.rb +5 -0
- data/llt-morphologizer.gemspec +34 -0
- data/spec/lib/llt/morphologizer/lookup_statement_spec.rb +29 -0
- data/spec/lib/llt/morphologizer/stem_lookup_statement_builder_spec.rb +39 -0
- data/spec/lib/llt/morphologizer_spec.rb +524 -0
- data/spec/spec_helper.rb +27 -0
- metadata +235 -0
@@ -0,0 +1,66 @@
|
|
1
|
+
module LLT
  class Morphologizer
    # Value object describing one stem lookup: which stem to search for, in
    # which table/column, restricted to which inflection classes, plus the
    # word components (ending, thematic, ...) that were stripped off the word
    # to arrive at that stem.
    class LookupStatement
      # The components hash handed in on construction; also available as
      # +options+.
      attr_reader :components
      alias :options :components

      # Positional arguments, in order: stem, table, column, itypes,
      # components.
      def initialize(*args)
        @stem, @table, @column, @itypes, @components = args
        safety_clones
      end

      # The column this statement looks the stem up in.
      def stem_type
        @column
      end

      # The table this statement looks the stem up in.
      def type
        @table
      end

      # Returns the lookup as a plain hash with the keys :type, :stem,
      # :stem_type and :restrictions.
      def to_query
        {
          type: @table,
          stem: @stem,
          stem_type: @column,
          restrictions: build_restrictions
        }
      end

      # Human readable description of the lookup. Relies on String responding
      # to +light_green+ and +cyan+, which are not defined in this file.
      def to_s
        "Looking up #{@stem.light_green} as #{@table}, #{@column} #{"with #{components_to_s}" if @components.any? } (classes: #{@itypes * ", "})"
      end

      private

      # The methods that help in the creation of such instances are
      # prepending and appending strings - especially the thematic.
      # Just to be safe, clones this value.
      def safety_clones
        if thematic = @components[:thematic]
          @components[:thematic] = thematic.clone
        end
      end

      # Renders every component as "key value"; empty values are shown as "".
      def components_to_s
        @components.map do |k, v|
          val = (v.empty? ? '""' : v)
          "#{k} #{val.to_s.cyan}"
        end * ", "
      end

      # Inflection classes that are all integers restrict by
      # :inflection_class, anything else restricts by :pf_composition.
      #
      # Integer is used instead of Fixnum: Fixnum was deprecated in Ruby 2.4
      # and removed in Ruby 3.2, where referencing it raises a NameError.
      def build_restrictions
        kw = @itypes.all? { |x| x.is_a?(Integer) } ? :inflection_class : :pf_composition

        {
          type: kw,
          values: @itypes
        }
      end
    end
  end
end
|
@@ -0,0 +1,130 @@
|
|
1
|
+
module LLT
  class Morphologizer
    # Builds LookupStatement objects for a word by repeatedly slicing
    # components (ending, extension, thematic, ...) off the end of the word
    # and recording which table/column/inflection classes the remaining stem
    # should be looked up in. The actual search strategies come from the
    # included Declinable and Conjugable modules.
    class StemLookupStatementBuilder

      require 'llt/morphologizer/stem_lookup_statement_builder/contracted_forms'
      require 'llt/morphologizer/stem_lookup_statement_builder/declinable'
      require 'llt/morphologizer/stem_lookup_statement_builder/conjugable'
      require 'llt/morphologizer/lookup_statement'

      include Declinable
      include Conjugable

      # word - the word to analyze; it is cloned because it gets sliced and
      #        restored continuously by this class.
      # log  - an object responding to +warning+.
      def initialize(word, log)
        @word = word.clone
        @log  = log

        # missing components read as (and are stored as) an empty string
        @components = Hash.new { |hash, key| hash[key] = "" }
        @lookup = {}
      end

      # Semantic alias: whatever is currently left of the word is the stem.
      def stem
        @word
      end

      GETTER_METHODS = { components: %w{ thematic extension comparison_sign ending contraction },
                         lookup:     %w{ table column itype } }

      # Defines one reader per name above, each returning the entry stored
      # under the same-named symbol in @components or @lookup respectively.
      GETTER_METHODS.each do |inst_var, names|
        names.each do |name|
          define_method(name) do
            instance_variable_get("@#{inst_var}")[name.to_sym]
          end
        end
      end

      # Runs the declinable and the conjugable searches and returns all
      # statements collected along the way.
      def statements
        @statements = []
        create_declinables
        create_conjugables

        @statements
      end

      # Clears all stashed components, blanks the lookup information and
      # remembers the operator used to pick the valid_itypes_for_* method.
      def setup(operator)
        @components.clear
        @lookup   = { table: "", column: "", itype: [] }
        @operator = operator
      end

      # Puts the given components back onto the end of the word, in the order
      # given, and removes them from the stash.
      def reset(*args)
        args.flatten.each { |comp| @word << @components.delete(comp).to_s }
      end

      # The component keys that are restored by a full reset.
      def all
        @all_memo ||= %i{ thematic extension comparison_sign ending }
      end

      # arr is a [type, patterns] pair. When one of the patterns is found in
      # the word, the find is sliced off and stashed under type; returns nil
      # when nothing was found.
      def has(arr)
        type, patterns = arr
        found = scan(patterns, type)
        slice_and_stash(type, found) if found
      end

      # Returns the first result of scanning the word with each pattern in
      # turn, or nil when none of them yields anything. +type+ is not used.
      def scan(components, type)
        hits = components.flat_map { |pattern| @word.scan(pattern) }
        hits.first
      end

      # Cuts +result+ off the end of the word and prepends it to whatever is
      # already stashed under +type+.
      def slice_and_stash(type, result)
        stash = @components[type]
        stash.prepend(@word.slice!(/#{result}$/))
      end

      # Registers lookup statements for the current stem. Passing :same keeps
      # the table and column of the previous call.
      def look_for the_table = :same, the_column = :same
        if the_table != :same
          @lookup[:table]  = the_table
          @lookup[:column] = the_column
        end

        send("valid_itypes_for_#{@operator}")
        add_statement!
        add_additional_persona_place_or_ethnic_statement!
        itype.clear
      end

      # Adds a statement for the current state, or logs a warning when no
      # inflection class is searchable.
      def add_statement!
        if itype.empty?
          return @log.warning("#{stem} with #{@components[:ending]} has no searchable infl classes.")
        end

        # 2013-09-27 19:23 @components.clone substituted with rejection of empty strings - observe if this leads to trouble.
        @statements << LookupStatement.new(cloned_stem, table, column, itype.clone, unemptied_components)
      end

      # The components without the empty ones. The ending always stays in -
      # otherwise some words trigger build all forms (cf ita).
      def unemptied_components
        @components.reject { |key, value| key != :ending && value.empty? }
      end

      # A copy of the stem, downcased unless a persona, place or ethnic is
      # being looked up.
      def cloned_stem
        stem.clone.tap do |copy|
          copy.downcase! unless persona_place_or_ethnic?
        end
      end

      def persona_place_or_ethnic?
        %i{ persona place ethnic }.include?(table)
      end

      # Stems starting with an uppercase letter get extra statements: nouns
      # are also looked up as persona and as place, adjectives (only in the
      # :stem column) also as ethnic. The lookup table is left at the last
      # value assigned here.
      def add_additional_persona_place_or_ethnic_statement!
        return unless stem =~ /^[A-Z].*/

        case table
        when :noun
          @lookup[:table] = :persona
          add_statement!
          @lookup[:table] = :place
          add_statement!
        when :adjective
          if column == :stem
            @lookup[:table] = :ethnic
            add_statement!
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,221 @@
|
|
1
|
+
# Verb side of the stem lookup statement builder. It strips verbal endings,
# extensions and thematic vowels off the word and calls look_for for every
# intermediate stem. Relies on the host class for has, look_for, reset, all,
# setup, stem and the component/lookup getters.
module LLT::Morphologizer::StemLookupStatementBuilder::Conjugable
  include LLT::Morphologizer::StemLookupStatementBuilder::ContractedForms

  # Ending patterns, named after person, number and voice. They are module
  # level methods because the constants below are assembled from them while
  # the module body is evaluated.
  def self.sg_1_active
    [/(?<!tud|[^nu][st]i|ment|[bc]ul|[ao]ri|\Apr)o$/, /(?<![^s]u)m$/]
  end

  def self.sg_2_active
    /(?<=[aer]|[^tr]i|[^aeirsl]ti|[^ai]ri|[^t][sft]eri|quiri|quaeri|\A[a-z]peri|[^a-z]geri|[^a-z]pari|[^a-z]meti)s$/
  end

  def self.sg_3_active
    /(?<=\S[ae]|[is])t$/
  end

  def self.pl_1_active
    /(?<!illi|erri|ssi|[^aeiu])mus$/
  end

  def self.pl_2_active
    /(?<=[aei])tis$/
  end

  def self.pl_3_active
    /(?<=[aeiu])nt$/
  end

  def self.sg_1_passive
    [/(?<!u)or$/, /(?<!u)r$/]
  end

  def self.sg_2_passive
    /(?<=[^p]a|[^afgtpsx]e|[^(qu)]i|[cr][uia]pe|[a-z][tg]e)ris$/
  end

  def self.sg_3_passive
    /(?<=[aei])tur$/
  end

  def self.pl_1_passive
    /mur$/
  end

  def self.pl_2_passive
    /(?<=[aei])mini$/
  end

  def self.pl_3_passive
    /(?<=[aeiu])ntur$/
  end

  # Every constant is a [component key, patterns] pair as expected by has.
  # The sg 2 passive pattern is kept out of PRIMARY_ENDING and searched in a
  # pass of its own.
  PRIMARY_ENDING = [
    :ending,
    [
      *sg_1_active, sg_2_active, sg_3_active, pl_1_active, pl_2_active, pl_3_active,
      *sg_1_passive, sg_3_passive, pl_1_passive, pl_2_passive, pl_3_passive
    ]
  ]
  PRIMARY_ENDING_SG_2_PASSIVE = [:ending, [sg_2_passive]]

  SECONDARY_ENDING  = [:ending, [/isti$/, /(?<=[^rnt])i$/, /it$/, /imus$/, /istis$/, /erunt$/, /ere$/]]
  IMPERATIVE_ENDING = [
    :ending,
    [
      /(?<=[aei])te$/, /tote$/, /(?<=[^ieu]a$|e$|[^min][^uv]i$)/, /(?<=[^n])to$/, /nto$/,
      /(?<=[^n])tor$/, /ntor$/
    ]
  ]
  DEP_IMP_ENDING = [:ending, [/(?<=[aei])re$/]]

  PERFECT_EXTENSIONS           = [:extension, [/er$|er[ai]$|isse$/]]
  IMPERFECT_BA                 = [:extension, [/ba$/]]
  FUTURE_B                     = [:extension, [/[b]$/]]
  FUTURE_OR_SUBJUNCTIVE_A_OR_E = [:extension, [/[ae]$/]]
  SUBJUNCTIVE_IMPERFECT        = [:extension, [/re$/]]

  THEMATIC_VOWEL                      = [:thematic, [/[eiu]$/]]
  THEMATIC_I_OF_M                     = [:thematic, [/[i]$/]]
  THEMATIC_E_OF_SUBJUNCTIVE_IMPERFECT = [:thematic, [/e$/]]

  # (?<=[aei])re not needed here as inf pr, - the dep_imp_ending finds it
  # anyway, the FormBuilder cares for the rest.
  INFINITIVE_PR = [:ending, [/(?<=[aei])ri$|(?<=[^aeior])i$|r?ier$/]]
  INFINITIVE_PF = [:ending, [/isse$/]]

  # Entry point: searches the word once for every contracted form that can
  # be expanded, then once more as it stands.
  def create_conjugables
    setup(:conjugable)
    search_for_contracted_form(:conjugable_search)

    setup(:conjugable)
    conjugable_search
  end

  # One full pass over all verbal patterns, in fixed order.
  def conjugable_search
    secondary_ending
    primary_ending(PRIMARY_ENDING)
    primary_ending(PRIMARY_ENDING_SG_2_PASSIVE)
    imperative
    infinitive
  end

  private

  # Endings from SECONDARY_ENDING lead to a lookup in the :pf column.
  def secondary_ending
    return unless has(SECONDARY_ENDING)

    look_for(:verb, :pf)
    reset(all)
  end

  # Strips one of the given endings and then works inwards through the
  # possible extensions and thematic vowels, looking the stem up after every
  # successful strip. The steps depend on each other through the shared
  # word/component state, so their order must not change.
  def primary_ending(const)
    return unless has(const)

    look_for(:verb, :pr)

    if has(IMPERFECT_BA)
      look_for(:verb, :pr)
      look_for(:same) if has(THEMATIC_VOWEL)
      look_for(:same) if has(THEMATIC_I_OF_M)
      reset(:thematic, :extension)
    end

    if has(THEMATIC_VOWEL)
      look_for(:same)
      look_for(:same) if has(THEMATIC_I_OF_M)
      if has(FUTURE_B)
        look_for(:same)
        # the extension is restored before the thematic in this branch only
        reset(:extension, :thematic)
      else
        reset(:thematic, :extension)
      end
    end

    if has(FUTURE_B)
      look_for(:same)
      reset(:thematic, :extension)
    end

    if has(FUTURE_OR_SUBJUNCTIVE_A_OR_E)
      look_for(:same)
      subjunctive_present_of_A_conjugation
      look_for(:same) if has(THEMATIC_I_OF_M)
      reset(:thematic, :extension)
    end

    if has(SUBJUNCTIVE_IMPERFECT)
      look_for(:same)
      look_for(:same) if has(THEMATIC_E_OF_SUBJUNCTIVE_IMPERFECT)
    end

    first_person_present_of_A_conjugation

    reset(:thematic, :extension)

    look_for(:verb, :pf) if has(PERFECT_EXTENSIONS)

    reset(all)
  end

  # Skipped entirely when the stem was already handled as one of the short
  # imperatives.
  def imperative
    return if short_imperative

    if has(IMPERATIVE_ENDING)
      look_for(:verb, :pr)
      if has(THEMATIC_VOWEL)
        look_for(:same)
        look_for(:same) if has(THEMATIC_I_OF_M)
      end
      reset(all)
    end

    if has(DEP_IMP_ENDING)
      look_for(:verb, :pr)
      if has(THEMATIC_VOWEL)
        look_for(:same)
        look_for(:same) if has(THEMATIC_I_OF_M)
      end
      reset(all)
    end
  end

  def infinitive
    if has(INFINITIVE_PR)
      look_for(:verb, :pr)
      look_for(:same) if has(THEMATIC_VOWEL)
      reset(all)
    end

    if has(INFINITIVE_PF)
      look_for(:verb, :pf)
      reset(all)
    end
  end

  # With an "e" extension and a stem not ending in i, the stem is also
  # looked up with an "a" appended.
  def subjunctive_present_of_A_conjugation
    append_a_search_and_chop_it if extension == "e" && stem !~ /i$/
  end

  # Same trick for endings in o/or when no extension was stripped.
  def first_person_present_of_A_conjugation
    append_a_search_and_chop_it if ending =~ /(o|or)$/ && extension.empty? # laudavero
  end

  # Temporarily appends "a" to the stem for one lookup.
  def append_a_search_and_chop_it
    stem << "a"
    look_for(:same)
    stem.chop!
  end

  # Looks the unchanged stem up when it ends in dic, duc, fac or fer.
  # Returns nil otherwise.
  def short_imperative
    # had return true before, but look_for should return true anyway
    look_for(:verb, :pr) if stem =~ /dic$|duc$|fac$|fer$/
  end

  # Fills itype with the inflection classes that are possible for the
  # current stem: numbers for the :pr column, strings for the :pf column.
  def valid_itypes_for_conjugable
    if column == :pr
      itype << 1 if stem =~ /a$/
      itype << 2 if stem =~ /e$/
      itype << 3 if stem =~ /[^aeio]$/ && thematic != "iu"
      itype << 4 if stem =~ /i$/
      itype << 5 if stem =~ /[^aeio]$/ && thematic != "u"
    end

    if column == :pf
      itype << "v" if stem =~ /v$/
      itype << "u" if stem =~ /u$/
      itype << "s" if stem =~ /s$|x$/
      itype.push("else", "ablaut") if stem !~ /v$|u$|s$|x$/
      # regexps needed
      itype << "reduplication" if stem
    end
  end
end
|
221
|
+
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module LLT
  class Morphologizer
    class StemLookupStatementBuilder
      # Support for contracted verb forms: the piece that was contracted away
      # is temporarily re-inserted into the word so that the regular search
      # can run on the expanded form.
      #
      # Expects the including object to hold the word in @word (a mutable
      # String) and a hash-like @components.
      module ContractedForms

        # Maps the piece to re-insert to the pattern that detects where it is
        # missing; the start of the match is the insertion point.
        CONTRACTED_FORMS = {
          "v"  => /(?<=[^v]i)(er[aiu]nt|er[ia][mst]|er[ia]mus|er[ia]tis|ero)$/,
          "vi" => /(?<=[^v][aeio])(stis?|sse[mst]|ssemus|ssetis|ssent|sse)$/,
          "ve" => /(?<=[^v][aeo])(r[aiu]nt|r[ia][mst]|r[ia]mus|r[ia]tis|ro)$/
        }

        # For every contraction pattern that matches the word: expands the
        # word, records a Contraction under @components[:contraction], calls
        # the method named +method+ on self and finally removes the inserted
        # piece again.
        #
        # method - name of the search method to invoke on the expanded word.
        def search_for_contracted_form(method)
          CONTRACTED_FORMS.each do |missing_piece, regexp|
            index = @word =~ regexp
            next if index.nil?

            @word.insert(index, missing_piece)
            @components[:contraction] = Contraction.new(index, missing_piece)

            send(method)

            # Remove exactly what was inserted. A fixed length of 2 would
            # also eat one original character after the one-letter "v".
            @word.slice!(index, missing_piece.length)
          end
        end

        # Records which piece was re-inserted at which position of the word.
        class Contraction
          def initialize(position, contraction)
            @position    = position
            @contraction = contraction
          end

          def empty?
            # duck type, fulfilling the contract of the other component strings
            false
          end

          def to_s
            "#{@contraction} contracted at #{@position}"
          end
        end
      end
    end
  end
end
|