llt-morphologizer 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/.travis.yml +8 -0
- data/Gemfile +27 -0
- data/LICENSE.txt +22 -0
- data/README.md +35 -0
- data/Rakefile +6 -0
- data/lib/llt/morphologizer.rb +378 -0
- data/lib/llt/morphologizer/lookup_statement.rb +66 -0
- data/lib/llt/morphologizer/stem_lookup_statement_builder.rb +130 -0
- data/lib/llt/morphologizer/stem_lookup_statement_builder/conjugable.rb +221 -0
- data/lib/llt/morphologizer/stem_lookup_statement_builder/contracted_forms.rb +38 -0
- data/lib/llt/morphologizer/stem_lookup_statement_builder/declinable.rb +214 -0
- data/lib/llt/morphologizer/version.rb +5 -0
- data/llt-morphologizer.gemspec +34 -0
- data/spec/lib/llt/morphologizer/lookup_statement_spec.rb +29 -0
- data/spec/lib/llt/morphologizer/stem_lookup_statement_builder_spec.rb +39 -0
- data/spec/lib/llt/morphologizer_spec.rb +524 -0
- data/spec/spec_helper.rb +27 -0
- metadata +235 -0
@@ -0,0 +1,66 @@
|
|
1
|
+
module LLT
  class Morphologizer
    # Describes one stem lookup: the stem to search for, the table and column
    # to search in, the inflection types the search is restricted to and the
    # word components that travel along with the stem.
    class LookupStatement
      attr_reader :components
      alias :options :components

      # Expects, in this order: stem, table, column, itypes, components.
      def initialize(*args)
        @stem, @table, @column, @itypes, @components = args
        safety_clones
      end

      # The column doubles as the stem type of the lookup.
      def stem_type
        @column
      end

      # The table doubles as the type of the lookup.
      def type
        @table
      end

      # Returns the statement as a Hash with the keys :type, :stem, :stem_type
      # and :restrictions.
      def to_query
        {
          type: @table,
          stem: @stem,
          stem_type: @column,
          restrictions: build_restrictions
        }
      end

      # Human readable, colorized description. Relies on String#light_green
      # and String#cyan being available on String.
      def to_s
        "Looking up #{@stem.light_green} as #{@table}, #{@column} #{"with #{components_to_s}" if @components.any? } (classes: #{@itypes * ", "})"
      end

      private

      # The methods that help in the creation of such instances are
      # prepending and appending strings - especially the thematic.
      # Just to be safe, clones this value.
      def safety_clones
        thematic = @components[:thematic]
        @components[:thematic] = thematic.clone if thematic
      end

      # Renders every component as "key value"; empty values are shown as "".
      def components_to_s
        rendered = @components.map do |k, v|
          val = v.empty? ? '""' : v
          "#{k} #{val.to_s.cyan}"
        end

        rendered * ", "
      end

      # Purely numeric itypes restrict by :inflection_class, everything else
      # by :pf_composition.
      #
      # Integer is checked instead of Fixnum: Fixnum is a subclass/alias of
      # Integer on old Rubies and no longer exists on Ruby >= 3.2.
      def build_restrictions
        numeric = @itypes.all? { |x| x.kind_of?(Integer) }

        {
          type: (numeric ? :inflection_class : :pf_composition),
          values: @itypes
        }
      end
    end
  end
end
|
@@ -0,0 +1,130 @@
|
|
1
|
+
module LLT
  class Morphologizer
    # Slices components (endings, extensions, thematic vowels, ...) off the end
    # of a word and records a LookupStatement for each remaining stem that has
    # at least one candidate inflection type.
    class StemLookupStatementBuilder

      require 'llt/morphologizer/stem_lookup_statement_builder/contracted_forms'
      require 'llt/morphologizer/stem_lookup_statement_builder/declinable'
      require 'llt/morphologizer/stem_lookup_statement_builder/conjugable'
      require 'llt/morphologizer/lookup_statement'

      include Declinable
      include Conjugable

      # word - the word to analyze (cloned, the original is never touched)
      # log  - an object responding to #warning
      def initialize(word, log)
        @word = word.clone # clone! because this will get sliced and reset continuously in this class
        @log  = log

        # every component defaults to its own empty, mutable String
        @components = Hash.new { |h, k| h[k] = "" }
        @lookup = {}
      end

      # a semantic help: whatever is left of the word is the current stem
      def stem
        @word
      end

      GETTER_METHODS = { components: %w{ thematic extension comparison_sign ending contraction },
                         lookup:     %w{ table column itype } }

      # Defines one reader per listed name, fetching the symbolized name from
      # @components or @lookup respectively.
      GETTER_METHODS.each do |inst_var, methods|
        ivar = "@#{inst_var}"

        methods.each do |method|
          key = method.to_sym
          define_method(method) { instance_variable_get(ivar)[key] }
        end
      end

      # Runs the declinable and the conjugable search and returns all
      # statements collected by them.
      def statements
        @statements = []
        create_declinables
        create_conjugables

        @statements
      end

      # Clears all stashed components and lookup information and remembers the
      # operator, which selects the valid_itypes_for_<operator> method that
      # look_for calls.
      def setup(operator)
        @components.clear
        @lookup   = { table: "", column: "", itype: [] }
        @operator = operator
      end

      # Removes the given components from the stash and appends their text to
      # the word again, in the order given. Accepts symbols or arrays of them.
      def reset(*args)
        args.flatten.each do |comp|
          @word << @components.delete(comp).to_s
        end
      end

      def all
        @all_memo ||= %i{ thematic extension comparison_sign ending }
      end

      # arr is a pair of [component type, array of regexps]. When one of the
      # regexps matches the word, the match is cut off the word's end and
      # stashed under the component type. Returns nil when nothing matched.
      def has(arr)
        type, components = arr
        result = scan(components, type)
        slice_and_stash(type, result) if result
      end

      # Returns the first match any of the regexps produces on the word, or
      # nil. The type argument is not used.
      def scan(components, type)
        components.flat_map { |x| @word.scan(x) }.first
      end

      # Cuts result off the end of the word and prepends it to the component
      # stashed under type.
      def slice_and_stash(type, result)
        # result is matched text, not a pattern - escape it so that regexp
        # metacharacters in it are taken literally
        suffix = /#{Regexp.escape(result.to_s)}$/
        @components[type].prepend(@word.slice!(suffix))
      end

      # Looks for the current stem in the given table and column. Called with
      # no arguments (or :same) the table and column stay as they are.
      def look_for(the_table = :same, the_column = :same)
        unless the_table == :same
          @lookup[:table]  = the_table
          @lookup[:column] = the_column
        end

        send("valid_itypes_for_#{@operator}")
        add_statement!
        add_additional_persona_place_or_ethnic_statement!
        itype.clear
      end

      # Adds a statement for the current state, or logs a warning when there
      # is no inflection type to search for.
      def add_statement!
        if itype.empty?
          @log.warning("#{stem} with #{@components[:ending]} has no searchable infl classes.")
        else
          # 2013-09-27 19:23 @components.clone substituted with rejection of empty strings - observe if this leads to trouble.
          st = LookupStatement.new(cloned_stem, table, column, itype.clone, unemptied_components)
          @statements << st
        end
      end

      def unemptied_components
        # leave ending always in - otherwise some words trigger build all forms (cf ita)
        @components.reject { |k, v| v.empty? unless k == :ending }
      end

      # A copy of the stem, downcased unless the current table is :persona,
      # :place or :ethnic.
      def cloned_stem
        s = stem.clone
        s.downcase! unless persona_place_or_ethnic?
        s
      end

      def persona_place_or_ethnic?
        table == :persona || table == :place || table == :ethnic
      end

      # A stem starting with a capital letter gets additional statements:
      # :persona and :place for nouns, :ethnic for adjective stems.
      #
      # NOTE(review): the table is left at the last value assigned here, so a
      # following look_for(:same) keeps using it - confirm this is intended.
      def add_additional_persona_place_or_ethnic_statement!
        return unless stem.match(/\A[A-Z]/)

        case table
        when :noun
          @lookup[:table] = :persona
          add_statement!
          @lookup[:table] = :place
          add_statement!
        when :adjective
          if column == :stem
            @lookup[:table] = :ethnic
            add_statement!
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,221 @@
|
|
1
|
+
module LLT::Morphologizer::StemLookupStatementBuilder::Conjugable
  include LLT::Morphologizer::StemLookupStatementBuilder::ContractedForms

  # Regexps for the personal endings. They are module level methods so that
  # the constants below can be assembled from them.

  def self.sg_1_active
    [/(?<!tud|[^nu][st]i|ment|[bc]ul|[ao]ri|\Apr)o$/, /(?<![^s]u)m$/]
  end

  def self.sg_2_active
    /(?<=[aer]|[^tr]i|[^aeirsl]ti|[^ai]ri|[^t][sft]eri|quiri|quaeri|\A[a-z]peri|[^a-z]geri|[^a-z]pari|[^a-z]meti)s$/
  end

  def self.sg_3_active
    /(?<=\S[ae]|[is])t$/
  end

  def self.pl_1_active
    /(?<!illi|erri|ssi|[^aeiu])mus$/
  end

  def self.pl_2_active
    /(?<=[aei])tis$/
  end

  def self.pl_3_active
    /(?<=[aeiu])nt$/
  end

  def self.sg_1_passive
    [/(?<!u)or$/, /(?<!u)r$/]
  end

  def self.sg_2_passive
    /(?<=[^p]a|[^afgtpsx]e|[^(qu)]i|[cr][uia]pe|[a-z][tg]e)ris$/
  end

  def self.sg_3_passive
    /(?<=[aei])tur$/
  end

  def self.pl_1_passive
    /mur$/
  end

  def self.pl_2_passive
    /(?<=[aei])mini$/
  end

  def self.pl_3_passive
    /(?<=[aeiu])ntur$/
  end

  # Each constant pairs a component type with the regexps to try for it -
  # the format the builder's #has expects.
  PRIMARY_ENDING = [
    :ending,
    [
      *sg_1_active, sg_2_active, sg_3_active,
      pl_1_active, pl_2_active, pl_3_active,
      *sg_1_passive, sg_3_passive,
      pl_1_passive, pl_2_passive, pl_3_passive
    ]
  ]
  # sg 2 passive is kept out of PRIMARY_ENDING and searched on its own
  PRIMARY_ENDING_SG_2_PASSIVE = [:ending, [sg_2_passive]]

  SECONDARY_ENDING  = [:ending, [/isti$/, /(?<=[^rnt])i$/, /it$/, /imus$/, /istis$/, /erunt$/, /ere$/]]
  IMPERATIVE_ENDING = [
    :ending,
    [
      /(?<=[aei])te$/, /tote$/, /(?<=[^ieu]a$|e$|[^min][^uv]i$)/,
      /(?<=[^n])to$/, /nto$/, /(?<=[^n])tor$/, /ntor$/
    ]
  ]
  DEP_IMP_ENDING = [:ending, [/(?<=[aei])re$/]]

  PERFECT_EXTENSIONS           = [:extension, [/er$|er[ai]$|isse$/]]
  IMPERFECT_BA                 = [:extension, [/ba$/]]
  FUTURE_B                     = [:extension, [/[b]$/]]
  FUTURE_OR_SUBJUNCTIVE_A_OR_E = [:extension, [/[ae]$/]]
  SUBJUNCTIVE_IMPERFECT        = [:extension, [/re$/]]

  THEMATIC_VOWEL                      = [:thematic, [/[eiu]$/]]
  THEMATIC_I_OF_M                     = [:thematic, [/[i]$/]]
  THEMATIC_E_OF_SUBJUNCTIVE_IMPERFECT = [:thematic, [/e$/]]

  # (?<=[aei])re not needed here as inf pr, - the dep_imp_ending finds it
  # anyway, the FormBuilder cares for the rest.
  INFINITIVE_PR = [:ending, [/(?<=[aei])ri$|(?<=[^aeior])i$|r?ier$/]]
  INFINITIVE_PF = [:ending, [/isse$/]]

  # Searches the word as a verb form: first with contracted forms expanded,
  # then as it stands.
  def create_conjugables
    setup(:conjugable)
    search_for_contracted_form(:conjugable_search)

    setup(:conjugable)
    conjugable_search
  end

  # Runs every verbal search in a fixed order.
  def conjugable_search
    secondary_ending
    primary_ending(PRIMARY_ENDING)
    primary_ending(PRIMARY_ENDING_SG_2_PASSIVE)
    imperative
    infinitive
  end

  private

  # Perfect stem forms carrying one of the secondary endings.
  def secondary_ending
    return unless has(SECONDARY_ENDING)

    look_for(:verb, :pf)
    reset(all)
  end

  # Present stem forms: strips the ending given through const, then tries
  # the extensions and thematic vowels one after another, looking up the
  # remaining stem after each successful step.
  def primary_ending(const)
    return unless has(const)

    look_for(:verb, :pr)

    if has(IMPERFECT_BA)
      look_for(:verb, :pr)
      look_for(:same) if has(THEMATIC_VOWEL)
      look_for(:same) if has(THEMATIC_I_OF_M)
      reset(:thematic, :extension)
    end

    if has(THEMATIC_VOWEL)
      look_for(:same)
      look_for(:same) if has(THEMATIC_I_OF_M)

      if has(FUTURE_B)
        look_for(:same)
        reset(:extension, :thematic)
      else
        reset(:thematic, :extension)
      end
    end

    if has(FUTURE_B)
      look_for(:same)
      reset(:thematic, :extension)
    end

    if has(FUTURE_OR_SUBJUNCTIVE_A_OR_E)
      look_for(:same)
      subjunctive_present_of_A_conjugation
      look_for(:same) if has(THEMATIC_I_OF_M)
      reset(:thematic, :extension)
    end

    if has(SUBJUNCTIVE_IMPERFECT)
      look_for(:same)
      look_for(:same) if has(THEMATIC_E_OF_SUBJUNCTIVE_IMPERFECT)
    end

    first_person_present_of_A_conjugation

    reset(:thematic, :extension)

    look_for(:verb, :pf) if has(PERFECT_EXTENSIONS)

    reset(all)
  end

  # Imperatives. The regular and the deponent endings are only tried when
  # the word is not one of the short imperatives.
  def imperative
    return if short_imperative

    [IMPERATIVE_ENDING, DEP_IMP_ENDING].each do |endings|
      next unless has(endings)

      look_for(:verb, :pr)

      if has(THEMATIC_VOWEL)
        look_for(:same)
        look_for(:same) if has(THEMATIC_I_OF_M)
      end

      reset(all)
    end
  end

  # Present and perfect infinitives.
  def infinitive
    if has(INFINITIVE_PR)
      look_for(:verb, :pr)
      look_for(:same) if has(THEMATIC_VOWEL)
      reset(all)
    end

    if has(INFINITIVE_PF)
      look_for(:verb, :pf)
      reset(all)
    end
  end

  # With an "e" extension and a stem not ending in i, the stem is looked
  # up once more with an "a" appended.
  def subjunctive_present_of_A_conjugation
    return unless extension == "e" && stem !~ /i$/

    append_a_search_and_chop_it
  end

  # With an o/or ending and no extension, the stem is looked up once more
  # with an "a" appended.
  def first_person_present_of_A_conjugation
    return unless ending =~ /(o|or)$/ && extension.empty? # laudavero

    append_a_search_and_chop_it
  end

  # Temporarily appends an "a" to the stem for a single lookup.
  def append_a_search_and_chop_it
    stem << "a"
    look_for(:same)
    stem.chop!
  end

  # Looks up stems ending in dic, duc, fac or fer right away. Returns a
  # truthy value when it did so (look_for's return value), nil otherwise.
  def short_imperative
    look_for(:verb, :pr) if stem =~ /dic$|duc$|fac$|fer$/
  end

  # Fills itype with the candidates for the current stem: numbers when the
  # column is :pr, strings when the column is :pf.
  def valid_itypes_for_conjugable
    if column == :pr
      itype << 1 if stem =~ /a$/
      itype << 2 if stem =~ /e$/
      itype << 3 if stem =~ /[^aeio]$/ && thematic != "iu"
      itype << 4 if stem =~ /i$/
      itype << 5 if stem =~ /[^aeio]$/ && thematic != "u"
    end

    if column == :pf
      itype << "v" if stem =~ /v$/
      itype << "u" if stem =~ /u$/
      itype << "s" if stem =~ /s$|x$/
      itype << "else"   if stem !~ /v$|u$|s$|x$/
      itype << "ablaut" if stem !~ /v$|u$|s$|x$/
      # regexps needed
      itype << "reduplication" if stem
    end
  end
end
|
221
|
+
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module LLT
  class Morphologizer
    class StemLookupStatementBuilder
      # Expands contracted perfect forms by re-inserting the piece that was
      # dropped in contraction, so that they can be searched like regular
      # forms.
      module ContractedForms

        # Maps the piece missing in a contracted form to a regexp that
        # matches at the position where the piece has to be inserted.
        CONTRACTED_FORMS = {
          "v"  => /(?<=[^v]i)(er[aiu]nt|er[ia][mst]|er[ia]mus|er[ia]tis|ero)$/,
          "vi" => /(?<=[^v][aeio])(stis?|sse[mst]|ssemus|ssetis|ssent|sse)$/,
          "ve" => /(?<=[^v][aeo])(r[aiu]nt|r[ia][mst]|r[ia]mus|r[ia]tis|ro)$/
        }.freeze

        # For every contraction pattern matching @word: inserts the missing
        # piece, stores a Contraction in @components[:contraction], calls the
        # given method and removes the inserted piece again.
        #
        # method - name of the search method to call on the expanded word
        def search_for_contracted_form(method)
          CONTRACTED_FORMS.each do |missing_piece, regexp|
            index = @word =~ regexp
            next if index.nil?

            @word.insert(index, missing_piece)
            @components[:contraction] = Contraction.new(index, missing_piece)

            send(method)

            # Remove exactly what was inserted. The pieces are one or two
            # characters long - a fixed length of 2 would also cut off the
            # character following a one-character piece.
            @word.slice!(index, missing_piece.length)
          end
        end

        # Remembers which piece was inserted at which position.
        class Contraction
          def initialize(position, contraction)
            @position    = position
            @contraction = contraction
          end

          def empty?
            # duck type, fulfilling the contract of the other component strings
            false
          end

          def to_s
            "#{@contraction} contracted at #{@position}"
          end
        end
      end
    end
  end
end
|