llt-morphologizer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 07f6f8feba062fb30ccc3806c038ba3300c22916
4
+ data.tar.gz: bbcc6c448dfa30429312b522501cb2fa59ff742f
5
+ SHA512:
6
+ metadata.gz: d912133821b73df731b94b33ad2f82c340fcb1fd1200ee079dfe8a885c23a3565fb88f06242fd39140b5655268004c1b2212aac4d4b98362faddec61a4eec0e3
7
+ data.tar.gz: 75d0914bac36f5300c0e22840035faba2050d149ff9779e0a131609dca44419d913b104fcc8ea4b8770871c985c982d2f1e88d09c5173c0ccedaa979e37228f7
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,8 @@
1
+ ---
2
+ language: ruby
3
+ before_script:
4
+ - export JRUBY_OPTS=--2.0
5
+ rvm:
6
+ - 2.1.0
7
+ - 2.0.0
8
+ - jruby-1.7.8
data/Gemfile ADDED
@@ -0,0 +1,27 @@
1
source 'https://rubygems.org'

# Specify your gem's dependencies in llt-morphologizer.gemspec
gemspec
gem 'coveralls', require: false

# The sibling LLT gems are pulled straight from GitHub. HTTPS URLs are used
# throughout: SSH URLs (git@github.com:...) only work on machines with a
# registered SSH key, and the unauthenticated git:// protocol is unencrypted.
gem 'llt-core', git: 'https://github.com/latin-language-toolkit/llt-core.git'
gem 'llt-core_extensions', git: 'https://github.com/latin-language-toolkit/llt-core_extensions.git'
gem 'llt-constants', git: 'https://github.com/latin-language-toolkit/llt-constants.git'
gem 'llt-db_handler', git: 'https://github.com/latin-language-toolkit/llt-db_handler.git'
gem 'llt-db_handler-stub', git: 'https://github.com/latin-language-toolkit/llt-db_handler-stub.git'
gem 'llt-helpers', git: 'https://github.com/latin-language-toolkit/llt-helpers.git'
gem 'llt-logger', git: 'https://github.com/latin-language-toolkit/llt-logger.git'

# Dependencies of db_handler
gem 'llt-form_builder', git: 'https://github.com/latin-language-toolkit/llt-form_builder.git'

# Database driver for MRI
platform :ruby do
  gem 'pg'
end

# Database driver and HTTP client for JRuby
platform :jruby do
  gem 'activerecord-jdbcpostgresql-adapter'
  gem 'jruby-httpclient'
end

gem 'pry'
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 LFDM
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,35 @@
1
+ # LLT::Morphologizer
2
+
3
+ [![Version](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/badge_fury.png)](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/badge_fury)
4
+ [![Dependencies](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/gemnasium.png)](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/gemnasium)
5
+ [![Build Status](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/travis.png)](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/travis)
6
+ [![Coverage](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/coveralls.png)](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/coveralls)
7
+ [![Code Climate](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/code_climate.png)](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/code_climate)
8
+
9
+ Morphological parsing of Latin forms
10
+
11
+ ## Installation
12
+
13
+ Add this line to your application's Gemfile:
14
+
15
+ gem 'llt-morphologizer'
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install llt-morphologizer
24
+
25
+ ## Usage
26
+
27
+ TODO: Write usage instructions here
28
+
29
+ ## Contributing
30
+
31
+ 1. Fork it ( http://github.com/<my-github-username>/llt-morphologizer/fork )
32
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
33
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
34
+ 4. Push to the branch (`git push origin my-new-feature`)
35
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
require 'bundler/gem_tasks'
require 'rspec/core/rake_task'

# Registers the `spec` rake task.
RSpec::Core::RakeTask.new(:spec)

# A bare `rake` invocation runs the `spec` task.
task default: :spec
@@ -0,0 +1,378 @@
1
+ require 'llt/constants'
2
+ require 'llt/core'
3
+ require 'llt/core_extensions/match_data'
4
+ require 'llt/db_handler/prometheus'
5
+ require 'llt/form_builder'
6
+ require 'llt/helpers/constantize'
7
+ require 'llt/helpers/normalizer'
8
+ require 'llt/helpers/pluralize'
9
+ require 'llt/helpers/primitive_cache'
10
+ require 'llt/logger'
11
+ require "llt/morphologizer/version"
12
+
13
+ module LLT
14
+ # Analyzes a token string morphologically.
15
+ #
16
+ # Looks up stems in a given db-dictionary and builds LLT::Form objects with the
17
+ # help of the LLT::FormBuilder.
18
+ class Morphologizer
19
+ require 'llt/morphologizer/stem_lookup_statement_builder'
20
+
21
+ include Core::Serviceable
22
+ include Helpers::Constantize
23
+ include Helpers::Normalizer
24
+ include Helpers::Pluralize
25
+ include Helpers::PrimitiveCache
26
+
27
+ uses_db { DbHandler::Prometheus.new }
28
+ uses_logger { Logger.new("Morphologizer", 2, default: :morph) }
29
+
30
# Creates a new Morphologizer.
#
# @param options [Hash] configuration; handed on to the parent initializer
# @option options [true] :cache enables caching
# @option options [DbHandler] :db db-handling object used to obtain stem information
# @option options [Logger] :logger object used for logging
def initialize(options = {})
  super(options)
  caching_requested = options[:cache]
  enable_cache if caching_requested
end
37
+
38
# Analyzes a single token morphologically.
#
# The result is obtained through `cached`, with `compute` supplied as the
# block that produces the value for the given word.
#
# @param word [String] token to be analyzed
# @param add_to [#<<] Keyword Argument: optional receiver; when it responds
#   to `<<`, the array of forms is appended to it as one element
#
# @return [Array<LLT::Form>] all valid Latin forms of the given string
def morphologize(word, add_to: nil)
  cached(word) { compute(word) }.tap do |forms|
    add_to << forms if add_to.respond_to?(:<<)
  end
end
50
+
51
+ private
52
+
53
# Resets the per-token state before a new analysis starts.
#
# @param word [String] the token that is about to be analyzed
# @return [nil]
def setup(word)
  @forms = []
  @uniq = false
  @word = word
  @statements = nil
end
59
+
60
# Runs the complete analysis pipeline for one token.
#
# The closed word classes are tried first, in a fixed order. As soon as one
# step produced forms and reports the result as unique, the collected forms
# are returned and the remaining steps are skipped. Otherwise the adverb
# lookup and the stem lookup in the database are both performed.
#
# @param word [String] token to be analyzed
# @return [Array] the forms collected for the token
def compute(word)
  setup(word)

  # The order is important, illustrated with the word cum: the preposition
  # knows that it can have another form (the subjunction), while the
  # subjunction says it's uniq. `||` stops at the first step that settles
  # the analysis, so later steps are not evaluated at all.
  settled =
    numerals ||
    (prepositions && unique_present?) ||
    (look_up(:conjunctions) && unique_present?) ||
    (look_up(:subjunctions) && unique_present?) ||
    (clook_up(:personal_pronouns) && unique_pers_pron?) ||
    (other_pronouns && unique_pronoun?) ||
    (irregular_verbs && unique_present?) ||
    (clook_up(:cardinals) && unique_cardinal?)
  return @forms if settled

  direct_lookup
  indirect_lookup

  @logger.error("Missing Word: #{@word}".red) if @forms.empty?
  @forms
end
82
+
83
+
84
+ ######### Numerals #########
85
+
86
# Adds a cardinal form when the token is a Roman numeral.
#
# @return [Array, nil] the collected forms if a numeral was added, else nil
def numerals
  return unless Helpers::RomanNumerals.roman?(@word)

  add_form(Form::Cardinal.new(roman: @word))
end
91
+
92
+
93
+ ######### Personal Pronouns && Cardinals #########
94
+
95
# Complex Lookup
#
# Fetches the constant named after the upcased type from LLT::Constants and
# looks the downcased token up in it. Each entry found is converted to
# constructor arguments by the matching `<singular type>_args` method and
# turned into an object of the corresponding LLT::Form class.
#
# @param type [Symbol] plural type name, e.g. :personal_pronouns or :cardinals
# @return [Array, nil] the collected forms when the token was found, else nil
def clook_up(type)
  entries = LLT::Constants.const_get(type.upcase)[@word.downcase]
  return unless entries

  singular = type.to_s.chop # cardinals to cardinal
  built = entries.map do |entry|
    args = send("#{singular}_args", entry)
    constant_by_type(singular, namespace: LLT::Form).new(args)
  end
  add_forms(built)
end
106
+
107
# Builds the constructor arguments for a personal pronoun form.
#
# @param pp [Array] inflection class, casus and numerus, in that order
# @return [Hash] arguments including the stem and suffix detected in the
#   current token
def personal_pronoun_args(pp)
  inflection_class, casus, numerus = pp
  stem, suffix = pers_pron_suffix_detection
  {
    stem: stem,
    suffix: suffix,
    inflection_class: inflection_class,
    casus: casus,
    numerus: numerus
  }
end
113
+
114
# Personal pronoun forms that cannot be treated as unique. A token found
# here makes `unique_pers_pron?` false, so the analysis continues with the
# remaining steps after the personal pronoun lookup.
HOMOPHONIC_PRONOUNS = Set.new(%w{ mei tui sui nostri nostrum vestri vestrum sese }).freeze

# Tells whether the current token is unambiguous as a personal pronoun.
#
# The token is downcased before the check because the personal pronoun
# lookup itself is done on the downcased token; without this a capitalized
# token such as "Mei" would wrongly count as unique.
#
# @return [Boolean] false when the token is one of HOMOPHONIC_PRONOUNS
def unique_pers_pron?
  !HOMOPHONIC_PRONOUNS.include?(@word.downcase)
end
118
+
119
# Splits the current token into a stem and a trailing cum, met or te.
#
# At least one character has to precede the suffix, so the bare token "te"
# stays a stem of its own instead of being read as an empty stem plus the
# suffix "te". The match is case-insensitive because the personal pronoun
# lookup is done on the downcased token; the returned strings keep the
# casing of the token. The token itself is never modified.
#
# @return [Array(String, String)] stem and suffix; the suffix is "" when
#   the token carries none
def pers_pron_suffix_detection
  match = /\A(?<stem>.+)(?<suffix>cum|met|te)\z/i.match(@word)
  match ? [match[:stem], match[:suffix]] : [@word.clone, ""]
end
124
+
125
# Builds the constructor arguments for a cardinal form.
#
# @param cardinal [Array] decimal value, casus, numerus and sexus, in that
#   order
# @return [Hash] the same values keyed by :decimal, :casus, :numerus, :sexus
def cardinal_args(cardinal)
  decimal, casus, numerus, sexus = cardinal
  { decimal: decimal, casus: casus, numerus: numerus, sexus: sexus }
end
130
+
131
# Tells whether a token recognized as a cardinal is unambiguous.
#
# @return [true] always - not sure if there is more needed.
def unique_cardinal?
  true
end
134
+
135
+
136
+
137
+ ######### Other Pronouns #########
138
+
139
# Builds pronoun forms when the downcased token matches the pronoun regexp.
#
# The match decides the inflection class; its captures are cleaned and
# handed to the FormBuilder as validated options.
#
# @return [Array, nil] the collected forms when the token matched, else nil
def other_pronouns
  match = pronouns_regexp.match(downcased)
  return unless match

  build_args = {
    type: :pronoun,
    inflection_class: extract_pronoun_type(match),
    options: opts_with_val(match.to_hash)
  }
  add_forms(FormBuilder.build(build_args))
end
149
+
150
# Derives the pronoun type from the captures of a pronoun match.
#
# quis and quid and all derivates (like aliquid) take a different path and
# use the substantivic endings; their lookup key gets the marker "_s".
#
# @param m [#[]] match exposing :stem, :ending, :particle and
#   :prefixed_particle
# @return [Symbol, nil] the type registered in PRONOUN_MAP, nil if unknown
def extract_pronoun_type(m)
  substantivic = m[:ending] =~ /i[ds]$/ && m[:stem] == "qu"

  key =
    if m[:particle] == m[:stem] + m[:ending]
      "quisquis"
    else
      # take only 2 chars of prefixed particle to match al(i)
      # and all forms of un(us|ius...) - to_s for nils
      prefix = m[:prefixed_particle].to_s[0..1]
      "#{prefix}#{m[:stem]}#{m[:particle]}#{substantivic ? "_s" : ""}"
    end
  PRONOUN_MAP[key.downcase]
end
163
+
164
# Maps the key computed in `extract_pronoun_type` to a pronoun type.
#
# A key consists of up to two characters of a prefixed particle, the stem,
# the particle and - for substantivic forms - the marker "_s". Every key is
# listed exactly once (duplicated keys in a hash literal trigger a Ruby
# warning) and the table is frozen as it is never meant to change.
PRONOUN_MAP = {
  # stem + particle => :type
  "hc"        => :hic,           "h"        => :hic,
  "hu"        => :hic,           "huc"      => :hic,
  "cu"        => :qui,           "qu"       => :qui,
  "qu_s"      => :quis,
  "qudam"     => :quidam,        "cudam"    => :quidam,
  "qunam"     => :quinam,        "cunam"    => :quinam,
  "i"         => :is,            "e"        => :is,
  "ips"       => :ipse,          "ill"      => :ille,
  "ist"       => :iste,
  "idem"      => :idem,          "edem"     => :idem,
  "qucumque"  => :quicumque,     "cucumque" => :quicumque,
  "alcu"      => :aliqui,        "alqu"     => :aliqui,
  "alqu_s"    => :aliquis,
  "culibet"   => :quilibet, # subst?
  "qulibet"   => :quilibet,
  "cuvis"     => :quivis,        "quvis"    => :quivis,
  "uterque"   => :uterque,       "utrque"   => :uterque,
  "uter"      => :uter,          "utr"      => :uter,
  "quque"     => :quisque,       "cuque"    => :quisque,
  "quque_s"   => :quisque_s,
  "ququam"    => :quisquam,      "ququam_s" => :quisquam,
  "cuquam"    => :quisquam,
  "quisquis"  => :quisquis,      "ququid"   => :quisquis,
  "unquque_s" => :unusquisque_s,
  "uncuque"   => :unusquisque,   "unquque"  => :unusquisque,
  "qupiam_s"  => :quispiam,      "qupiam"   => :quispiam,
  "cupiam"    => :quispiam,
}.freeze
191
+
192
# Despite its name this set lists the pronoun forms that are NOT unique: a
# token found here makes `unique_pronoun?` false, so the analysis continues
# with the remaining steps after the pronoun lookup.
UNIQUE_PRONOUNS = Set.new(%w{ hic is eam eas eo i quam quod quo qua }).freeze

# Tells whether the current token is unambiguous as a pronoun.
#
# The token is downcased before the check because the pronoun regexp is
# matched against the downcased token as well; without this a capitalized
# token such as "Hic" would wrongly count as unique.
#
# @return [Boolean] false when the token is listed in UNIQUE_PRONOUNS
def unique_pronoun?
  !UNIQUE_PRONOUNS.include?(@word.downcase)
end
196
+
197
# Accessor for the regexp used to recognize pronouns.
#
# @return [Object] the value of LLT::Constants::RegExps::PRONOUNS
def pronouns_regexp
  LLT::Constants::RegExps::PRONOUNS
end
200
+
201
+
202
+ ######### Irregular Verbs #########
203
+
204
# Tries to analyze the token as a form of an irregular verb.
#
# Every regexp of every stem type of every irregular verb is matched
# against the token. Each match builds forms from the irregular stems of
# that verb. Once a verb matched and the token is not registered as a
# homograph in HOMOGRAPHIC_IRREGS, the token counts as unique and no
# further verb is tried.
#
# @return [true] always. The method used to return nil whenever the loop
#   was left through `break` (a `break` makes `each` return nil), which is
#   exactly the case in which a unique verb had been found - so the caller's
#   `irregular_verbs && unique_present?` could not succeed unless the match
#   happened on the last verb of the table. Uniqueness is reported through
#   `unique_present?` alone.
def irregular_verbs
  irregular_verbs_regexps.each do |verb, stems|
    # We cannot immediately return on a match, as quite often another
    # match will be made with the same lemma. Therefore only break at the
    # top, so that a match of esse cannot go on to ire or anything else.
    break if @uniq

    stems.each do |stem_type, regexps|
      regexps.each do |regexp|
        match = regexp.match(@word)
        next unless match

        @logger.log("Matched irregular verb #{@word.yellow} with #{verb.to_s.yellow}")
        stem_pack = irregular_stems(verb)
        # TODO: temporary nexting, delete when all ISPs are written down
        next unless stem_pack

        add_forms(create_forms(stem_type, stem_pack, match.to_hash))

        @uniq = true unless HOMOGRAPHIC_IRREGS[verb].match(@word)
      end
    end
  end

  true
end
228
+
229
# Per irregular verb, a regexp for tokens that are spelled like a form of
# that verb but must not be treated as unique. Verbs without an entry fall
# back to a default regexp that is not expected to match any token.
HOMOGRAPHIC_IRREGS = Hash.new(/in_doubt_better_don't_match/).update(
  ferre: /fero/,
  ire: /subito/
)
234
+
235
# Looks up the stems of an irregular verb.
#
# @param key [Symbol] verb identifier as used in the irregular verb regexps
# @return [Object, nil] the entry stored in
#   LLT::StemBuilder::IRREGULAR_STEMS under the key
def irregular_stems(key)
  LLT::StemBuilder::IRREGULAR_STEMS[key]
end
238
+
239
# Accessor for the regexps used to recognize irregular verbs.
#
# @return [Object] the value of LLT::Constants::RegExps::IRREGULAR_VERBS,
#   iterated in `irregular_verbs` as verb => { stem type => regexps }
def irregular_verbs_regexps
  LLT::Constants::RegExps::IRREGULAR_VERBS
end
242
+
243
+
244
+ ######### Subjunctions & Conjunctions#########
245
+
246
# Looks the downcased token up in the constant named after the given type
# and adds a form of the corresponding class when it is listed.
#
# The constants are saved in a format of
#   key   = string
#   value = homophonous_forms?
# A listed token may therefore map to a falsy value, which is why presence
# is tested with has_key? instead of relying on the value itself. A falsy
# value marks the token as unique.
#
# @param arg [Symbol] plural type name, e.g. :conjunctions or :subjunctions
# @return [Array, nil] the collected forms when the token is listed, else nil
def look_up(arg)
  table = Constants.const_get(arg.upcase)
  token = downcased
  return unless table.has_key?(token)

  @uniq = true unless table[token]
  form_class = Form.const_get(arg.to_s.chop.capitalize)
  add_form(form_class.new(string: @word))
end
261
+
262
+
263
+ ######### Prepositions #########
264
+
265
# Adds a preposition form when the downcased token is a known preposition.
#
# @return [Array, nil] the collected forms when the token is listed, else nil
def prepositions
  entry = Constants::PREPOSITIONS[downcased]
  return unless entry

  # preps are { word => 4 6 not_uniq }
  @uniq = true unless entry.last
  takes_4th, takes_6th = entry[0..1]
  args = { string: @word, takes_4th: takes_4th, takes_6th: takes_6th }
  add_form(Form::Preposition.new(args))
end
274
+
275
+ ######### Direct Lookup like Adverbs #########
276
+
277
# Runs the lookups that fetch complete words from the database; currently
# this is only the adverb lookup.
#
# @return [Object] whatever `create_adverbs` returns
def direct_lookup
  create_adverbs
end
280
+
281
# Asks the database for adverbs matching the downcased token and adds one
# adverb form per entry returned.
#
# @return [Object] the entries returned by the database
def create_adverbs
  @db.direct_lookup(:adverb, downcased).each do |entry|
    add_form(Form::Adverb.new(string: entry.word))
  end
end
287
+
288
+
289
+ ######### Creation through DB #########
290
+
291
# Builds forms from stems found in the database.
#
# @return [Object] whatever `look_up_and_build_forms` returns
def indirect_lookup
  # Called for its side effect: it fills @statements, which
  # look_up_and_build_forms reads directly.
  statements
  look_up_and_build_forms
end
295
+
296
# Returns the stem lookup statements for the current token, building them
# on first use and reusing them until `setup` resets them.
#
# @return [Object] result of StemLookupStatementBuilder#statements
def statements
  return @statements if @statements

  @statements = StemLookupStatementBuilder.new(@word, @logger).statements
end
299
+
300
# Queries the database with every lookup statement and builds forms from
# all stems that were found. Each statement and its outcome is logged.
#
# Reads @statements directly, so `statements` must have been called before.
#
# @return [Object] the iterated statements
def look_up_and_build_forms
  @statements.each do |statement|
    @logger.log(statement.to_s)

    stems = @db.look_up_stem(statement.to_query)

    unless stems.any?
      @logger.bare("0 entries found".yellow, 8)
      next
    end

    @logger.bare("#{stems.size} #{pluralize(stems.size, 'entry')} found: #{stems.map(&:to_s) * ", "}", 8)

    stems.each do |stem_pack|
      selector = t.send(statement.stem_type, :full)
      add_forms(create_forms(selector, stem_pack, statement.options))
    end
  end
end
320
+
321
+
322
+ ######### Helpers #########
323
+
324
# Builds forms from a stem pack and attaches the stem pack to each of them.
#
# @param selector [Object] first argument handed to `stem_pack.to_hash`
# @param stem_pack [#to_hash] stem information used to build the forms
# @param options [Hash] components of the analyzed token; cleaned and
#   flagged for validation before they reach the FormBuilder
# @return [Object] the forms returned by FormBuilder.build
def create_forms(selector, stem_pack, options)
  build_args = stem_pack.to_hash(selector, opts_with_val(options))
  FormBuilder.build(build_args).each { |form| form.stems = stem_pack }
end
328
+
329
# Logs how many forms were created and what they look like.
#
# @param new_forms [Array] the forms that are about to be added
# @return [Object] whatever the logger's `bare` returns
def log_form_creation(new_forms)
  message =
    if new_forms.empty?
      "No forms created".red
    else
      count = new_forms.size
      "#{count} #{pluralize(count, "form")} created: #{new_forms.map(&:to_s) * ", "}".green
    end
  @logger.bare(message, 8)
end
337
+
338
# Cleans the given components and requests validation of the built forms.
#
# @param opts [Hash] components of the analyzed token
# @return [Hash] a new hash: the cleaned components plus validate: true
def opts_with_val(opts)
  cleaned = adapted_components(opts)
  cleaned.merge(validate: true)
end
341
+
342
# Logs and collects a single form.
#
# @param form [Object] the form to add
# @return [Array] the collected forms
def add_form(form)
  log_form_creation([form])
  @forms.push(form)
end
346
+
347
# Logs and collects several forms at once. The collection is replaced by a
# new array, the previous one is left untouched.
#
# @param forms [Array] the forms to add
# @return [Array] the collected forms
def add_forms(forms)
  log_form_creation(forms)
  @forms = @forms + forms
end
351
+
352
# Tells whether one of the lookups marked the current token as unique.
#
# @return [Boolean] the current value of the uniqueness flag
def unique_present?
  @uniq
end
355
+
356
# Lowercase copy of the current token, used for all dictionary lookups.
#
# @return [String] the downcased token
def downcased
  @word.downcase
end
359
+
360
# Removes components that carry no information.
#
# A component is dropped when its value is nil or false, or when it is
# empty - except for :ending, which is kept even when empty.
#
# TODO 30.09.13 12:13 by LFDM
#   Look for nils in comps, probably due to regexps
#
#   This method looks useless at first sight, as this is already done in
#   LookupStatement to some extent. Its main use seems to be for some nil
#   cases that need to be found; afterwards we can delete this.
#
# @param comps [Hash] components of the analyzed token
# @return [Hash] a new hash without the dropped components
def adapted_components(comps)
  comps.reject do |key, value|
    next true unless value

    key != :ending && value.empty?
  end
end
374
+
375
# Hide the lookup tables from code outside of this class. Every constant is
# named once; HOMOPHONIC_PRONOUNS used to be listed twice.
private_constant :HOMOPHONIC_PRONOUNS, :PRONOUN_MAP, :UNIQUE_PRONOUNS,
                 :HOMOGRAPHIC_IRREGS
377
+ end
378
+ end