llt-morphologizer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
1
module LLT
  class Morphologizer
    # Describes one stem lookup: which stem to search for, in which table and
    # column, and which inflection classes the hit is restricted to.
    #
    # Instances are plain value holders; #to_query turns them into the hash
    # that is handed to whatever performs the actual lookup.
    class LookupStatement
      # The components hash given to the constructor (fifth argument).
      attr_reader :components
      alias :options :components

      # @param args [Array] positional values, in this order:
      #   stem, table, column, itypes, components.
      #   +itypes+ must respond to +all?+ and +*+ (an Array in all visible
      #   callers); +components+ must respond to +[]+, +[]=+, +any?+ and +map+.
      def initialize(*args)
        @stem, @table, @column, @itypes, @components = args
        safety_clones
      end

      # @return the column the stem is looked up in.
      def stem_type
        @column
      end

      # @return the table the stem is looked up in.
      def type
        @table
      end

      # @return [Hash] the lookup expressed as
      #   +{ type:, stem:, stem_type:, restrictions: { type:, values: } }+.
      def to_query
        {
          type: @table,
          stem: @stem,
          stem_type: @column,
          restrictions: build_restrictions
        }
      end

      # Human readable description of the lookup.
      # NOTE(review): relies on String#light_green and String#cyan, which are
      # not defined in this file - presumably a colorizing String extension
      # loaded elsewhere.
      def to_s
        "Looking up #{@stem.light_green} as #{@table}, #{@column} #{"with #{components_to_s}" if @components.any? } (classes: #{@itypes * ", "})"
      end

      private

      # The methods that help in the creation of such instances are
      # prepending and appending strings - especially the thematic.
      # Just to be safe, the :thematic entry is replaced by a clone of itself.
      # Only that one entry is cloned; the hash itself is the one passed in.
      def safety_clones
        if thematic = @components[:thematic]
          @components[:thematic] = thematic.clone
        end
      end

      # Renders every component as "key value"; empty values are shown as "".
      def components_to_s
        @components.map do |k, v|
          val = (v.empty? ? '""' : v)
          "#{k} #{val.to_s.cyan}"
        end.join(", ")
      end

      # Builds the restrictions part of the query.
      #
      # When every itype is an Integer the restriction is an
      # :inflection_class, otherwise a :pf_composition.
      #
      # Integer is used instead of Fixnum: Fixnum was merged into Integer in
      # Ruby 2.4 and the constant was removed in Ruby 3.2, where referencing it
      # raises NameError. On older Rubies every Fixnum is an Integer as well.
      def build_restrictions
        kw = if @itypes.all? { |x| x.kind_of?(Integer) }
               :inflection_class
             else
               :pf_composition
             end

        {
          type: kw,
          values: @itypes
        }
      end
    end
  end
end
@@ -0,0 +1,130 @@
1
module LLT
  class Morphologizer
    # Produces LookupStatement objects for a single word.
    #
    # The word is kept in @word and is sliced from its end piece by piece
    # (see #has); the sliced pieces are stashed in @components and put back
    # with #reset. Whatever is left of @word at any moment is the current
    # stem candidate (see #stem).
    class StemLookupStatementBuilder

      # These files are loaded from inside the class body, so the class
      # constant already exists when they are evaluated.
      require 'llt/morphologizer/stem_lookup_statement_builder/contracted_forms'
      require 'llt/morphologizer/stem_lookup_statement_builder/declinable'
      require 'llt/morphologizer/stem_lookup_statement_builder/conjugable'
      require 'llt/morphologizer/lookup_statement'

      include Declinable
      include Conjugable

      # Reader names, grouped by the instance variable (a hash) they read from.
      GETTER_METHODS = { components: %w{ thematic extension comparison_sign ending contraction },
                         lookup: %w{ table column itype } }

      # Defines one reader per name above, e.g. #thematic => @components[:thematic].
      GETTER_METHODS.each do |store, names|
        names.each do |name|
          key = name.to_sym
          define_method(name) { instance_variable_get("@#{store}")[key] }
        end
      end

      # @param word [String] the word to analyze; a clone is stored because
      #   it gets sliced and reset continuously in this class
      # @param log  an object responding to #warning
      def initialize(word, log)
        @word = word.clone
        @log = log

        # unknown components read as (and are stored as) an empty string
        @components = Hash.new { |hash, key| hash[key] = "" }
        @lookup = {}
      end

      # A semantic help: what is currently left of the word.
      def stem
        @word
      end

      # Runs the declinable and the conjugable search and returns all
      # statements collected along the way.
      # NOTE(review): create_declinables is not defined in this file -
      # presumably supplied by the Declinable module.
      def statements
        @statements = []
        create_declinables
        create_conjugables

        @statements
      end

      # Clears all stashed components and lookup settings and remembers the
      # operator, which selects the valid_itypes_for_<operator> method used
      # by #look_for.
      def setup(operator)
        @components.clear
        @lookup = { table: "", column: "", itype: [] }
        @operator = operator
      end

      # Appends the given components back onto the word, in the given order,
      # and removes them from the stash. Components that are not stashed
      # contribute an empty string.
      def reset(*args)
        args.flatten.each { |key| @word << @components.delete(key).to_s }
      end

      # The component keys handled by a full reset.
      def all
        @all_memo ||= %i{ thematic extension comparison_sign ending }
      end

      # Takes a pair of [component type, patterns]. When one of the patterns
      # matches the word, the match is sliced off and stashed under the type.
      #
      # @return the stashed component when something matched, otherwise nil
      def has(arr)
        type, components = arr
        result = scan(components, type)
        slice_and_stash(type, result) if result
      end

      # Returns the first match any of the patterns produces on the word, in
      # pattern order, or nil when none matches. The type argument is unused.
      def scan(components, type)
        components.flat_map { |pattern| @word.scan(pattern) }.first
      end

      # Cuts the matched text off the end of the word and puts it in front of
      # whatever is already stashed under type.
      def slice_and_stash(type, result)
        sliced = @word.slice!(/#{result}$/)
        @components[type].prepend(sliced)
      end

      # Creates the statement(s) for the current stem.
      #
      # Table and column are only overwritten when a table other than :same
      # is given; otherwise the previous lookup settings stay in place.
      # The collected itypes are emptied again afterwards.
      def look_for(the_table = :same, the_column = :same)
        if the_table != :same
          @lookup[:table] = the_table
          @lookup[:column] = the_column
        end

        send("valid_itypes_for_#{@operator}")
        add_statement!
        add_additional_persona_place_or_ethnic_statement!
        itype.clear
      end

      # Adds a LookupStatement for the current state, or logs a warning when
      # no inflection classes were found for it.
      def add_statement!
        return @log.warning("#{stem} with #{@components[:ending]} has no searchable infl classes.") if itype.empty?

        # 2013-09-27 19:23 @components.clone substituted with rejection of empty strings - observe if this leads to trouble.
        statement = LookupStatement.new(cloned_stem, table, column, itype.clone, unemptied_components)
        @statements << statement
      end

      # The stashed components without the empty ones. The ending is always
      # left in - otherwise some words trigger build all forms (cf ita).
      def unemptied_components
        @components.reject { |key, value| key != :ending && value.empty? }
      end


      # A copy of the stem, downcased unless the current table is one of
      # :persona, :place or :ethnic.
      def cloned_stem
        stem.clone.tap do |copy|
          copy.downcase! unless persona_place_or_ethnic?
        end
      end

      def persona_place_or_ethnic?
        [:persona, :place, :ethnic].include?(table)
      end

      # For a stem starting with a capital letter, adds extra statements with
      # the table switched to :persona and :place (coming from :noun), or to
      # :ethnic (coming from :adjective, only for the :stem column).
      # The table is left at the last value assigned here.
      def add_additional_persona_place_or_ethnic_statement!
        return unless stem.match(/^[A-Z].*/)

        case table
        when :noun
          @lookup[:table] = :persona
          add_statement!
          @lookup[:table] = :place
          add_statement!
        when :adjective
          if column == :stem
            @lookup[:table] = :ethnic
            add_statement!
          end
        end
      end
    end
  end
end
@@ -0,0 +1,221 @@
1
# Search strategy for verb forms, mixed into StemLookupStatementBuilder.
#
# The constants below are [component type, patterns] pairs in the shape
# expected by the builder's #has; the methods slice endings, extensions and
# thematic vowels off the word and call #look_for on what is left.
module LLT::Morphologizer::StemLookupStatementBuilder::Conjugable
  include LLT::Morphologizer::StemLookupStatementBuilder::ContractedForms

  # Ending patterns, one module-level method per person/number/voice.
  # They are only used to assemble the constants further down.

  def self.sg_1_active
    [/(?<!tud|[^nu][st]i|ment|[bc]ul|[ao]ri|\Apr)o$/, /(?<![^s]u)m$/]
  end

  def self.sg_2_active
    /(?<=[aer]|[^tr]i|[^aeirsl]ti|[^ai]ri|[^t][sft]eri|quiri|quaeri|\A[a-z]peri|[^a-z]geri|[^a-z]pari|[^a-z]meti)s$/
  end

  def self.sg_3_active
    /(?<=\S[ae]|[is])t$/
  end

  def self.pl_1_active
    /(?<!illi|erri|ssi|[^aeiu])mus$/
  end

  def self.pl_2_active
    /(?<=[aei])tis$/
  end

  def self.pl_3_active
    /(?<=[aeiu])nt$/
  end

  def self.sg_1_passive
    [/(?<!u)or$/, /(?<!u)r$/]
  end

  def self.sg_2_passive
    /(?<=[^p]a|[^afgtpsx]e|[^(qu)]i|[cr][uia]pe|[a-z][tg]e)ris$/
  end

  def self.sg_3_passive
    /(?<=[aei])tur$/
  end

  def self.pl_1_passive
    /mur$/
  end

  def self.pl_2_passive
    /(?<=[aei])mini$/
  end

  def self.pl_3_passive
    /(?<=[aeiu])ntur$/
  end

  # All primary endings except the second person singular passive, which
  # is searched for separately through PRIMARY_ENDING_SG_2_PASSIVE.
  PRIMARY_ENDING = [
    :ending,
    [
      *sg_1_active, sg_2_active, sg_3_active, pl_1_active, pl_2_active, pl_3_active,
      *sg_1_passive, sg_3_passive, pl_1_passive, pl_2_passive, pl_3_passive
    ]
  ]
  PRIMARY_ENDING_SG_2_PASSIVE = [:ending, [sg_2_passive]]

  SECONDARY_ENDING = [:ending, [ /isti$/, /(?<=[^rnt])i$/, /it$/, /imus$/, /istis$/, /erunt$/, /ere$/ ]]
  IMPERATIVE_ENDING = [
    :ending,
    [
      /(?<=[aei])te$/, /tote$/, /(?<=[^ieu]a$|e$|[^min][^uv]i$)/, /(?<=[^n])to$/, /nto$/,
      /(?<=[^n])tor$/, /ntor$/
    ]
  ]
  DEP_IMP_ENDING = [:ending, [ /(?<=[aei])re$/ ]]

  PERFECT_EXTENSIONS = [:extension, [/er$|er[ai]$|isse$/]]
  IMPERFECT_BA = [:extension, [/ba$/]]
  FUTURE_B = [:extension, [/[b]$/]]
  FUTURE_OR_SUBJUNCTIVE_A_OR_E = [:extension, [/[ae]$/]]
  SUBJUNCTIVE_IMPERFECT = [:extension, [/re$/]]

  THEMATIC_VOWEL = [:thematic, [/[eiu]$/]]
  THEMATIC_I_OF_M = [:thematic, [/[i]$/]]
  THEMATIC_E_OF_SUBJUNCTIVE_IMPERFECT = [:thematic, [/e$/]]

  # (?<=[aei])re is not needed here for the present infinitive - the
  # DEP_IMP_ENDING finds it anyway, the FormBuilder cares for the rest.
  INFINITIVE_PR = [:ending, [/(?<=[aei])ri$|(?<=[^aeior])i$|r?ier$/]]
  INFINITIVE_PF = [:ending, [/isse$/]]

  # Entry point: runs the conjugable search twice, first on the word with
  # a possible contraction expanded, then on the word as it is. Each run
  # starts from a fresh setup.
  def create_conjugables
    setup(:conjugable)
    search_for_contracted_form(:conjugable_search)

    setup(:conjugable)
    conjugable_search
  end


  # One full pass over all verb form families, in a fixed order.
  def conjugable_search
    secondary_ending
    primary_ending(PRIMARY_ENDING)
    primary_ending(PRIMARY_ENDING_SG_2_PASSIVE)
    imperative
    infinitive
  end

  private

  # Secondary endings lead to a lookup in the :pf column.
  def secondary_ending
    return unless has(SECONDARY_ENDING)

    look_for(:verb, :pr == :pf ? :pr : :pf)
    reset(all)
  end

  # Searches below one of the primary ending sets. Every successful slice
  # is followed by a lookup; thematic and extension are put back between
  # the individual attempts, everything is put back at the very end.
  # The argument order of the reset calls is the order in which the pieces
  # are appended to the word again, so it must not be changed.
  def primary_ending(const)
    return unless has(const)

    look_for(:verb, :pr)

    if has(IMPERFECT_BA)
      look_for(:verb, :pr)
      look_for(:same) if has(THEMATIC_VOWEL)
      look_for(:same) if has(THEMATIC_I_OF_M)
      reset(:thematic, :extension)
    end

    if has(THEMATIC_VOWEL)
      look_for(:same)
      look_for(:same) if has(THEMATIC_I_OF_M)
      if has(FUTURE_B)
        look_for(:same)
        # the extension was sliced after the thematic, so it goes back first
        reset(:extension, :thematic)
      else
        reset(:thematic, :extension)
      end
    end

    if has(FUTURE_B)
      look_for(:same)
      reset(:thematic, :extension)
    end

    if has(FUTURE_OR_SUBJUNCTIVE_A_OR_E)
      look_for(:same)
      subjunctive_present_of_A_conjugation
      look_for(:same) if has(THEMATIC_I_OF_M)
      reset(:thematic, :extension)
    end

    if has(SUBJUNCTIVE_IMPERFECT)
      look_for(:same)
      look_for(:same) if has(THEMATIC_E_OF_SUBJUNCTIVE_IMPERFECT)
    end

    first_person_present_of_A_conjugation

    reset(:thematic, :extension)

    look_for(:verb, :pf) if has(PERFECT_EXTENSIONS)

    reset(all)
  end

  # Imperatives: the short forms are handled by #short_imperative; only
  # when that did not trigger a lookup are the regular and the deponent
  # imperative endings tried.
  def imperative
    return if short_imperative

    if has(IMPERATIVE_ENDING)
      look_for(:verb, :pr)
      if has(THEMATIC_VOWEL)
        look_for(:same)
        look_for(:same) if has(THEMATIC_I_OF_M)
      end
      reset(all)
    end

    if has(DEP_IMP_ENDING)
      look_for(:verb, :pr)
      if has(THEMATIC_VOWEL)
        look_for(:same)
        look_for(:same) if has(THEMATIC_I_OF_M)
      end
      reset(all)
    end
  end

  # Present infinitives are looked up in :pr, perfect infinitives in :pf.
  def infinitive
    if has(INFINITIVE_PR)
      look_for(:verb, :pr)
      look_for(:same) if has(THEMATIC_VOWEL)
      reset(all)
    end

    if has(INFINITIVE_PF)
      look_for(:verb, :pf)
      reset(all)
    end
  end

  # With an "e" extension and a stem not ending in i, the stem is also
  # tried with an appended a.
  def subjunctive_present_of_A_conjugation
    if extension == "e" && stem !~ /i$/
      append_a_search_and_chop_it
    end
  end

  # With an o/or ending and no extension, the stem is also tried with an
  # appended a. The extension check keeps forms like laudavero out.
  def first_person_present_of_A_conjugation
    if ending =~ /(o|or)$/ && extension.empty?
      append_a_search_and_chop_it
    end
  end

  # Temporarily appends an a to the stem, performs a lookup and removes
  # the a again.
  def append_a_search_and_chop_it
    stem << "a"
    look_for(:same)
    stem.chop!
  end

  # Looks the stem up directly when it ends in dic, duc, fac or fer.
  # Returns the result of look_for in that case (had return true before,
  # but look_for should return true anyway), otherwise nil.
  def short_imperative
    look_for(:verb, :pr) if stem =~ /dic$|duc$|fac$|fer$/
  end

  # Fills itype with the inflection classes worth searching for the
  # current stem: integers for the :pr column, strings for the :pf column.
  # Invoked dynamically by the builder's look_for.
  def valid_itypes_for_conjugable
    case column
    when :pr
      itype << 1 if stem =~ /a$/
      itype << 2 if stem =~ /e$/
      itype << 3 if stem =~ /[^aeio]$/ && thematic != "iu"
      itype << 4 if stem =~ /i$/
      itype << 5 if stem =~ /[^aeio]$/ && thematic != "u"
    when :pf
      itype << "v" if stem =~ /v$/
      itype << "u" if stem =~ /u$/
      itype << "s" if stem =~ /s$|x$/
      itype << "else" if stem !~ /v$|u$|s$|x$/
      itype << "ablaut" if stem !~ /v$|u$|s$|x$/
      # regexps needed - for now this is added for every stem
      itype << "reduplication" if stem
    end
  end
end
221
+
@@ -0,0 +1,38 @@
1
# The namespace is reopened in nested form (same constant path as
# LLT::Morphologizer::StemLookupStatementBuilder::ContractedForms), so this
# file no longer depends on the parent constants being defined beforehand.
module LLT
  class Morphologizer
    class StemLookupStatementBuilder
      # Support for contracted forms: the letters dropped by the contraction
      # are put back into the word, a search is run on the expanded word, and
      # the word is restored afterwards.
      #
      # Expects the including object to hold the word in @word (a mutable
      # String) and the stashed components in @components.
      module ContractedForms

        # Maps the letters to re-insert to a pattern whose match position is
        # the insertion point, e.g. "audierunt" -> "audiverunt" ("v") and
        # "amasti" -> "amavisti" ("vi").
        CONTRACTED_FORMS = {
          "v"  => /(?<=[^v]i)(er[aiu]nt|er[ia][mst]|er[ia]mus|er[ia]tis|ero)$/,
          "vi" => /(?<=[^v][aeio])(stis?|sse[mst]|ssemus|ssetis|ssent|sse)$/,
          "ve" => /(?<=[^v][aeo])(r[aiu]nt|r[ia][mst]|r[ia]mus|r[ia]tis|ro)$/
        }


        # Tries every contraction pattern on the word. For each pattern that
        # matches, the missing letters are inserted, a Contraction is stored
        # under @components[:contraction], and the given search method is
        # invoked on the expanded word.
        #
        # @param method [Symbol, String] name of the search method to call
        #   through send while the word is expanded; it is expected to leave
        #   @word as it found it
        def search_for_contracted_form(method)
          CONTRACTED_FORMS.each do |missing_piece, regexp|
            index = @word =~ regexp
            next if index.nil?

            @word.insert(index, missing_piece)
            @components[:contraction] = Contraction.new(index, missing_piece)

            send(method)

            # Remove exactly what was inserted. A fixed length of 2 also ate
            # the character following a one-letter insertion ("v") and left
            # a corrupted word behind for the following searches.
            @word.slice!(index, missing_piece.length)
          end
        end

        # Records which letters were inserted at which position of the word.
        class Contraction
          # @param position [Integer] index at which the letters were inserted
          # @param contraction [String] the inserted letters
          def initialize(position, contraction)
            @position = position
            @contraction = contraction
          end

          # Duck type, fulfilling the contract of the other component strings:
          # a contraction never counts as an empty component.
          def empty?
            false
          end

          def to_s
            "#{@contraction} contracted at #{@position}"
          end
        end
      end
    end
  end
end
+ end