llt-morphologizer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 07f6f8feba062fb30ccc3806c038ba3300c22916
4
+ data.tar.gz: bbcc6c448dfa30429312b522501cb2fa59ff742f
5
+ SHA512:
6
+ metadata.gz: d912133821b73df731b94b33ad2f82c340fcb1fd1200ee079dfe8a885c23a3565fb88f06242fd39140b5655268004c1b2212aac4d4b98362faddec61a4eec0e3
7
+ data.tar.gz: 75d0914bac36f5300c0e22840035faba2050d149ff9779e0a131609dca44419d913b104fcc8ea4b8770871c985c982d2f1e88d09c5173c0ccedaa979e37228f7
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,8 @@
1
+ ---
2
+ language: ruby
3
+ before_script:
4
+ - export JRUBY_OPTS=--2.0
5
+ rvm:
6
+ - 2.1.0
7
+ - 2.0.0
8
+ - jruby-1.7.8
data/Gemfile ADDED
@@ -0,0 +1,27 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in llt-morphologizer.gemspec
4
+ gemspec
5
+ gem 'coveralls', require: false
6
+
7
+ gem 'llt-core', git: 'git@github.com:latin-language-toolkit/llt-core.git'
8
+ gem 'llt-core_extensions', git: 'git@github.com:latin-language-toolkit/llt-core_extensions.git'
9
+ gem 'llt-constants', git: 'git@github.com:latin-language-toolkit/llt-constants.git'
10
+ gem 'llt-db_handler', git: 'git@github.com:latin-language-toolkit/llt-db_handler.git'
11
+ gem 'llt-db_handler-stub', git: 'git@github.com:latin-language-toolkit/llt-db_handler-stub.git'
12
+ gem 'llt-helpers', git: 'git@github.com:latin-language-toolkit/llt-helpers.git'
13
+ gem 'llt-logger', git: 'git://github.com/latin-language-toolkit/llt-logger.git'
14
+
15
+ # Dependencies of db_handler
16
+ gem 'llt-form_builder', git: 'git@github.com:latin-language-toolkit/llt-form_builder.git'
17
+
18
+ platform :ruby do
19
+ gem 'pg'
20
+ end
21
+
22
+ platform :jruby do
23
+ gem 'activerecord-jdbcpostgresql-adapter'
24
+ gem 'jruby-httpclient'
25
+ end
26
+
27
+ gem 'pry'
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 LFDM
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,35 @@
1
+ # LLT::Morphologizer
2
+
3
+ [![Version](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/badge_fury.png)](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/badge_fury)
4
+ [![Dependencies](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/gemnasium.png)](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/gemnasium)
5
+ [![Build Status](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/travis.png)](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/travis)
6
+ [![Coverage](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/coveralls.png)](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/coveralls)
7
+ [![Code Climate](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/code_climate.png)](http://allthebadges.io/latin-language-toolkit/llt-morphologizer/code_climate)
8
+
9
+ Morphological parsing of Latin forms
10
+
11
+ ## Installation
12
+
13
+ Add this line to your application's Gemfile:
14
+
15
+ gem 'llt-morphologizer'
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install llt-morphologizer
24
+
25
+ ## Usage
26
+
27
+ TODO: Write usage instructions here
28
+
29
+ ## Contributing
30
+
31
+ 1. Fork it ( http://github.com/<my-github-username>/llt-morphologizer/fork )
32
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
33
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
34
+ 4. Push to the branch (`git push origin my-new-feature`)
35
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,378 @@
1
+ require 'llt/constants'
2
+ require 'llt/core'
3
+ require 'llt/core_extensions/match_data'
4
+ require 'llt/db_handler/prometheus'
5
+ require 'llt/form_builder'
6
+ require 'llt/helpers/constantize'
7
+ require 'llt/helpers/normalizer'
8
+ require 'llt/helpers/pluralize'
9
+ require 'llt/helpers/primitive_cache'
10
+ require 'llt/logger'
11
+ require "llt/morphologizer/version"
12
+
13
+ module LLT
14
+ # Analyzes a token string morphologically.
15
+ #
16
+ # Looks up stems in a given db-dictionary and builds LLT::Form objects with the
17
+ # help of the LLT::FormBuilder.
18
+ class Morphologizer
19
+ require 'llt/morphologizer/stem_lookup_statement_builder'
20
+
21
+ include Core::Serviceable
22
+ include Helpers::Constantize
23
+ include Helpers::Normalizer
24
+ include Helpers::Pluralize
25
+ include Helpers::PrimitiveCache
26
+
27
+ uses_db { DbHandler::Prometheus.new }
28
+ uses_logger { Logger.new("Morphologizer", 2, default: :morph) }
29
+
30
+ # @option options [true] :cache enables caching
31
+ # @option options [DbHandler] :db db-handling object used to obtain stem information
32
+ # @option options [Logger] :logger object used for logging
33
+ def initialize(options = {})
34
+ super
35
+ enable_cache if options[:cache]
36
+ end
37
+
38
+ # Takes a string and analyzes it morphologically
39
+ #
40
+ # @param [String] word token to be analyzed
41
+ # @param add_to [#<<] Keyword Argument: optional object that receives the
42
+ # returned forms (the array of forms is appended to it with <<)
43
+ #
44
+ # @return [Array<LLT::Form>] all valid Latin forms of the given string
45
+ def morphologize(word, add_to: nil)
46
+ forms = cached(word) { compute(word) }
47
+ add_to << forms if add_to.respond_to?(:<<)
48
+ forms
49
+ end
50
+
51
+ private
52
+
53
+ def setup(word)
54
+ @word = word
55
+ @forms = []
56
+ @uniq = false
57
+ @statements = nil
58
+ end
59
+
60
+ def compute(word)
61
+ # The order of these checks is important, as the word cum illustrates:
62
+ # the preposition knows that it can have another form (the subjunction),
63
+ # while the subjunction declares itself unique.
64
+
65
+ setup(word)
66
+
67
+ return @forms if numerals
68
+ return @forms if prepositions && unique_present?
69
+ return @forms if look_up(:conjunctions) && unique_present?
70
+ return @forms if look_up(:subjunctions) && unique_present?
71
+ return @forms if clook_up(:personal_pronouns) && unique_pers_pron?
72
+ return @forms if other_pronouns && unique_pronoun?
73
+ return @forms if irregular_verbs && unique_present?
74
+ return @forms if clook_up(:cardinals) && unique_cardinal?
75
+
76
+ direct_lookup
77
+ indirect_lookup
78
+
79
+ @logger.error("Missing Word: #{@word}".red) if @forms.empty?
80
+ @forms
81
+ end
82
+
83
+
84
+ ######### Numerals #########
85
+
86
+ def numerals
87
+ if Helpers::RomanNumerals.roman?(@word)
88
+ add_form(Form::Cardinal.new(roman: @word))
89
+ end
90
+ end
91
+
92
+
93
+ ######### Personal Pronouns && Cardinals #########
94
+
95
+ # Complex Lookup
96
+ def clook_up(type)
97
+ if forms = LLT::Constants.const_get(type.upcase)[@word.downcase]
98
+ new_forms = forms.map do |form|
99
+ sg_type = type.to_s.chop # cardinals to cardinal
100
+ args = send("#{sg_type}_args", form)
101
+ constant_by_type(sg_type, namespace: LLT::Form).new(args)
102
+ end
103
+ add_forms(new_forms)
104
+ end
105
+ end
106
+
107
+ def personal_pronoun_args(pp)
108
+ # pp is an array of iclass, casus, numerus
109
+ ic, c, n = pp
110
+ stem, suffix = pers_pron_suffix_detection
111
+ { stem: stem, suffix: suffix, inflection_class: ic, casus: c, numerus: n }
112
+ end
113
+
114
+ HOMOPHONIC_PRONOUNS = Set.new(%w{ mei tui sui nostri nostrum vestri vestrum sese })
115
+ def unique_pers_pron?
116
+ ! HOMOPHONIC_PRONOUNS.include?(@word)
117
+ end
118
+
119
+ def pers_pron_suffix_detection
120
+ stem = @word.clone
121
+ stem.chomp!($1) if stem.match(/.*(cum|met|te)$/)
122
+ [stem, ($1 || "")]
123
+ end
124
+
125
+ def cardinal_args(cardinal)
126
+ # cardinal is an array
127
+ dec, c, n, s = cardinal
128
+ { decimal: dec, casus: c, numerus: n, sexus: s }
129
+ end
130
+
131
+ def unique_cardinal?
132
+ true # not sure if there is more needed.
133
+ end
134
+
135
+
136
+
137
+ ######### Other Pronouns #########
138
+
139
+ def other_pronouns
140
+ if m = pronouns_regexp.match(downcased)
141
+ pronoun_type = extract_pronoun_type(m)
142
+
143
+ stem = { type: :pronoun, inflection_class: pronoun_type }
144
+ new_forms = FormBuilder.build(stem.merge(options: opts_with_val(m.to_hash)))
145
+
146
+ add_forms(new_forms)
147
+ end
148
+ end
149
+
150
+ # quis and quid and all derivatives (like aliquid) take a different
151
+ # path and use the substantival endings
152
+ def extract_pronoun_type(m)
153
+ subst = (m[:ending] =~ /i[ds]$/ && m[:stem] == "qu") ? "_s" : ""
154
+ key = if m[:particle] == m[:stem] + m[:ending]
155
+ "quisquis"
156
+ else
157
+ # take only 2 chars of prefixed particle to match al(i)
158
+ # and all forms of un(us|ius...) - to_s for nils
159
+ "#{m[:prefixed_particle].to_s[0..1]}#{m[:stem]}#{m[:particle]}#{subst}"
160
+ end
161
+ PRONOUN_MAP[key.downcase]
162
+ end
163
+
164
+ PRONOUN_MAP = {
165
+ #stem + particle => :type
166
+ "hc" => :hic, "alcu" => :aliqui,
167
+ "h" => :hic, "alqu" => :aliqui,
168
+ "hu" => :hic, "alqu_s" => :aliquis,
169
+ "huc" => :hic, "culibet" => :quilibet,#subst?
170
+ "cu" => :qui, "qulibet" => :quilibet,
171
+ "qu" => :qui, "cuvis" => :quivis,
172
+ "qudam" => :quidam, "quvis" => :quivis,
173
+ "cudam" => :quidam, "qu_s" => :quis,
174
+ "qunam" => :quinam, "uterque" => :uterque,
175
+ "cunam" => :quinam, "utrque" => :uterque,
176
+ "i" => :is, "uter" => :uter,
177
+ "e" => :is, "utr" => :uter,
178
+ "ips" => :ipse, "quque" => :quisque,
179
+ "ill" => :ille, "cuque" => :quisque,
180
+ "ist" => :iste, "quque_s" => :quisque_s,
181
+ "idem" => :idem, "ququam" => :quisquam,
182
+ "edem" => :idem, "ququam_s" => :quisquam,
183
+ "qucumque" => :quicumque, "cuquam" => :quisquam,
184
+ "cucumque" => :quicumque, "quisquis" => :quisquis,
185
+ "alcu" => :aliqui, "ququid" => :quisquis,
186
+ "alqu" => :aliqui, "unquque_s" => :unusquisque_s,
187
+ "alqu_s" => :aliquis, "uncuque" => :unusquisque,
188
+ "qupiam_s" => :quispiam, "unquque" => :unusquisque,
189
+ "qupiam" => :quispiam, "cupiam" => :quispiam,
190
+ }
191
+
192
+ UNIQUE_PRONOUNS = Set.new(%w{ hic is eam eas eo i quam quod quo qua })
193
+ def unique_pronoun?
194
+ ! UNIQUE_PRONOUNS.include?(@word)
195
+ end
196
+
197
+ def pronouns_regexp
198
+ LLT::Constants::RegExps::PRONOUNS
199
+ end
200
+
201
+
202
+ ######### Irregular Verbs #########
203
+
204
+ def irregular_verbs
205
+ irregular_verbs_regexps.each do |verb, stems|
206
+ break if @uniq
207
+ stems.each do |stem_type, regexps|
208
+ regexps.each do |regexp|
209
+ if m = regexp.match(@word)
210
+ @logger.log("Matched irregular verb #{@word.yellow} with #{verb.to_s.yellow}")
211
+ stem_pack = irregular_stems(verb)
212
+ next unless stem_pack # temporary nexting, delete when all ISPs are written down
213
+
214
+
215
+ new_forms = create_forms(stem_type, stem_pack, m.to_hash)
216
+ add_forms(new_forms)
217
+
218
+ # We cannot return immediately, as quite often another match
219
+ # will be made with the same lemma. Therefore we only
220
+ # break at the top, so that a match of esse cannot go on to ire
221
+ # or anything else.
222
+ @uniq = true unless HOMOGRAPHIC_IRREGS[verb].match(@word)
223
+ end
224
+ end
225
+ end
226
+ end
227
+ end
228
+
229
+ HOMOGRAPHIC_IRREGS = {
230
+ ferre: /fero/,
231
+ ire: /subito/,
232
+ }
233
+ HOMOGRAPHIC_IRREGS.default = (/in_doubt_better_don't_match/)
234
+
235
+ def irregular_stems(key)
236
+ LLT::StemBuilder::IRREGULAR_STEMS[key]
237
+ end
238
+
239
+ def irregular_verbs_regexps
240
+ LLT::Constants::RegExps::IRREGULAR_VERBS
241
+ end
242
+
243
+
244
+ ######### Subjunctions & Conjunctions #########
245
+
246
+ def look_up(arg)
247
+ # A bit messy, the constants are saved in a format of
248
+ # key = string
249
+ # value = homophonous_forms?
250
+ # That's why we need to access the hash twice, as const[@word]
251
+ # could return false and thus fail the conditional test
252
+ # with an inline assignment 'if (something = const[@word])'
253
+
254
+ const = Constants.const_get(arg.upcase)
255
+ w = downcased
256
+ if const.has_key?(w)
257
+ @uniq = true unless const[w]
258
+ add_form(Form.const_get(arg.to_s.chop.capitalize).new(string: @word))
259
+ end
260
+ end
261
+
262
+
263
+ ######### Prepositions #########
264
+
265
+ def prepositions
266
+ if prep = Constants::PREPOSITIONS[downcased]
267
+ # preps are { word => [takes_4th, takes_6th, not_uniq] }
268
+ @uniq = true unless prep.last
269
+ takes_4th, takes_6th = prep[0..1]
270
+ args = { string: @word, takes_4th: takes_4th, takes_6th: takes_6th }
271
+ add_form(Form::Preposition.new(args))
272
+ end
273
+ end
274
+
275
+ ######### Direct Lookup like Adverbs #########
276
+
277
+ def direct_lookup
278
+ create_adverbs
279
+ end
280
+
281
+ def create_adverbs
282
+ entries = @db.direct_lookup(:adverb, downcased)
283
+ entries.each do |entry|
284
+ add_form(Form::Adverb.new(string: entry.word))
285
+ end
286
+ end
287
+
288
+
289
+ ######### Creation through DB #########
290
+
291
+ def indirect_lookup
292
+ statements
293
+ look_up_and_build_forms
294
+ end
295
+
296
+ def statements
297
+ @statements ||= StemLookupStatementBuilder.new(@word, @logger).statements
298
+ end
299
+
300
+ def look_up_and_build_forms
301
+ @statements.each do |statement|
302
+ @logger.log(statement.to_s)
303
+
304
+ stems = @db.look_up_stem(statement.to_query)
305
+
306
+ if stems.any?
307
+ @logger.bare("#{stems.size} #{pluralize(stems.size, 'entry')} found: #{stems.map(&:to_s) * ", "}", 8)
308
+
309
+ stems.each do |stem_pack|
310
+ type = t.send(statement.stem_type, :full)
311
+ new_forms = create_forms(type, stem_pack, statement.options)
312
+
313
+ add_forms(new_forms)
314
+ end
315
+ else
316
+ @logger.bare("0 entries found".yellow, 8)
317
+ end
318
+ end
319
+ end
320
+
321
+
322
+ ######### Helpers #########
323
+
324
+ def create_forms(selector, stem_pack, options)
325
+ forms = FormBuilder.build(stem_pack.to_hash(selector, opts_with_val(options)))
326
+ forms.each { |form| form.stems = stem_pack }
327
+ end
328
+
329
+ def log_form_creation(new_forms)
330
+ m = if new_forms.empty?
331
+ "No forms created".red
332
+ else
333
+ "#{new_forms.size} #{pluralize(new_forms.size, "form")} created: #{new_forms.map(&:to_s) * ", "}".green
334
+ end
335
+ @logger.bare(m, 8)
336
+ end
337
+
338
+ def opts_with_val(opts)
339
+ adapted_components(opts).merge(validate: true)
340
+ end
341
+
342
+ def add_form(form)
343
+ log_form_creation([form])
344
+ @forms << form
345
+ end
346
+
347
+ def add_forms(forms)
348
+ log_form_creation(forms)
349
+ @forms += forms
350
+ end
351
+
352
+ def unique_present?
353
+ @uniq
354
+ end
355
+
356
+ def downcased
357
+ @word.downcase
358
+ end
359
+
360
+ def adapted_components(comps)
361
+ # TODO 30.09.13 12:13 by LFDM
362
+ # Look for nils in comps, probably due to regexps
363
+ #
364
+ # This method looks useless at first sight, as this is already done in LookupStatement to some extent;
365
+ # its main use seems to be for some nil cases that need to be found. Afterwards we can delete this.
366
+ comps.reject do |k, v|
367
+ if v
368
+ v.empty? unless k == :ending
369
+ else
370
+ true
371
+ end
372
+ end
373
+ end
374
+
375
+ private_constant :HOMOPHONIC_PRONOUNS, :PRONOUN_MAP, :UNIQUE_PRONOUNS,
376
+ :HOMOPHONIC_PRONOUNS, :HOMOGRAPHIC_IRREGS
377
+ end
378
+ end