daidai 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,134 @@
1
+ # frozen_string_literal: true
2
+
3
module Daidai
  # Base class for every error raised by this gem.
  class Error < StandardError; end

  # Optional resolver backed by the `kabosu` gem (Ruby bindings for the Sudachi
  # morphological analyzer). Turns a bare word — even an inflected one like
  # "食べている" — into its dictionary form and JMdict part of speech, so you can
  # conjugate without naming the POS yourself:
  #
  #   Daidai.conjugate("食べている")   # kabosu finds 食べる / v1, then conjugates
  #
  # This needs the `kabosu` gem plus an installed Sudachi dictionary. Neither is
  # a hard dependency of daidai — the rest of the gem is pure Ruby and
  # zero-dependency. The escape hatch is simply to pass the POS, in which case
  # kabosu never runs:
  #
  #   Daidai.conjugate("食べる", "v1")
  #
  # NOTE: this module is nested inside Daidai, so the top-level kabosu gem must
  # be referenced as ::Kabosu to avoid resolving back to Daidai::Kabosu.
  module Kabosu
    # Raised when a POS-less conjugation is requested but the `kabosu` gem and a
    # Sudachi dictionary aren't available.
    class MissingDependency < Error; end

    # Sudachi 活用型 (conjugation type) => JMdict POS code. Sudachi names the verb
    # row but not the JMdict subclass for a handful of irregulars, so LEMMA_POS
    # overrides those by dictionary form.
    CONJUGATION_TYPE = {
      "五段-カ行" => "v5k", "五段-ガ行" => "v5g", "五段-サ行" => "v5s",
      "五段-タ行" => "v5t", "五段-ナ行" => "v5n", "五段-バ行" => "v5b",
      "五段-マ行" => "v5m", "五段-ラ行" => "v5r", "五段-ワア行" => "v5u",
      "カ行変格" => "vk", "サ行変格" => "vs-i"
    }.freeze

    # Dictionary-form overrides for verbs whose JMdict subclass Sudachi's 活用型
    # can't distinguish (irregular okurigana inside an otherwise-regular row).
    # Only consulted for morphemes tagged as verbs (see jmdict_pos).
    LEMMA_POS = {
      "行く" => "v5k-s", "逝く" => "v5k-s", "往く" => "v5k-s",
      "有る" => "v5r-i", "在る" => "v5r-i", "ある" => "v5r-i"
    }.freeze

    # Top-level part-of-speech tags (element 0 of the POS array) that resolve
    # treats as conjugatable. Frozen once instead of being rebuilt per morpheme.
    INFLECTING_POS = %w[動詞 形容詞 形状詞].freeze
    private_constant :INFLECTING_POS

    class << self
      # Resolve `text` to { word:, pos:, reading: } from its first inflecting
      # morpheme, or nil when nothing conjugatable is found.
      #
      # @param text [String] a word or phrase handed to the tokenizer
      # @return [Hash, nil] `word` is the dictionary form, `pos` the JMdict POS
      #   code, `reading` the hiragana reading or nil (see dictionary_reading)
      # @raise [MissingDependency] when kabosu/a dictionary isn't installed
      def resolve(text)
        morphemes = tokenizer.tokenize(text).to_a
        index = morphemes.index { |m| inflecting?(m.part_of_speech) }
        return nil unless index

        morpheme = morphemes[index]
        preceding = index.positive? ? morphemes[index - 1] : nil

        # 名詞+する compounds (勉強した → 勉強, vs): the noun is the dictionary entry.
        if suru?(morpheme.part_of_speech) && preceding && suru_noun?(preceding.part_of_speech)
          return entry(preceding, "vs")
        end

        pos = jmdict_pos(morpheme.part_of_speech, morpheme.dictionary_form)
        pos && entry(morpheme, pos)
      end

      # Pure mapping: a Sudachi part-of-speech array + dictionary form => JMdict
      # POS code, or nil. Exposed (and unit-tested) without needing kabosu.
      #
      # The LEMMA_POS override is applied only when the morpheme is tagged as a
      # verb, so a non-verb that merely shares a dictionary form with one of the
      # irregular verbs is not reported with a verb code.
      #
      # @param pos [Array<String>, nil] part-of-speech array; element 0 is read
      #   as the top-level tag and element 4 as the conjugation type
      # @param lemma [String] dictionary form of the morpheme
      # @return [String, nil] JMdict POS code, or nil when `pos` is nil or unmapped
      def jmdict_pos(pos, lemma)
        return nil if pos.nil?

        (verb?(pos) && LEMMA_POS[lemma]) || from_conjugation_type(pos)
      end

      # Whether the resolver is usable (kabosu loadable + a dictionary present).
      #
      # @return [Boolean] false when building the tokenizer raises MissingDependency
      def available?
        !tokenizer.nil?
      rescue MissingDependency
        false
      end

      # Drop the memoized tokenizer so the next call builds a fresh one.
      def reset! = (@tokenizer = nil)

      private

      # Map a POS array to a JMdict code using its top-level tag and, for verbs,
      # its conjugation type; nil when there is no mapping.
      def from_conjugation_type(pos)
        case pos[0]
        when "動詞"
          # Ichidan rows carry a row suffix after 上一段/下一段, hence the prefix test.
          CONJUGATION_TYPE[pos[4]] || (pos[4].to_s.start_with?("上一段", "下一段") ? "v1" : nil)
        when "形容詞" then "adj-i"
        when "形状詞" then "adj-na"
        end
      end

      def verb?(pos) = pos[0] == "動詞"

      def inflecting?(pos)
        INFLECTING_POS.include?(pos[0])
      end

      def suru?(pos) = pos[4] == "サ行変格"
      def suru_noun?(pos) = pos[0] == "名詞" && pos[2] == "サ変可能"

      # Build the result hash returned by resolve for `morpheme` under `pos`.
      def entry(morpheme, pos)
        { word: morpheme.dictionary_form, pos: pos, reading: dictionary_reading(morpheme) }
      end

      # `reading_form` is the *surface* reading; it matches the dictionary form
      # only when the input wasn't inflected. Otherwise leave the reading to
      # conjugate — a kana word is its own reading, and a kanji word just omits
      # the kana column rather than carry a wrong one.
      def dictionary_reading(morpheme)
        return nil unless morpheme.surface == morpheme.dictionary_form

        hiragana(morpheme.reading_form)
      end

      # Memoized tokenizer; built on first use and cleared by reset!.
      def tokenizer
        @tokenizer ||= build_tokenizer
      end

      # Lazily require kabosu and create a tokenizer. Both a missing gem
      # (LoadError) and any StandardError raised while creating the tokenizer
      # are surfaced as MissingDependency with a hint on how to proceed.
      def build_tokenizer
        require "kabosu"
        ::Kabosu::Dictionary.new(system_dict: ::Kabosu::Dictionary.path).create(mode: :c)
      rescue LoadError
        raise MissingDependency,
              'Daidai.conjugate(word) without a POS needs the `kabosu` gem. Add `gem "kabosu"` ' \
              '(and install a Sudachi dictionary), or pass the JMdict POS: Daidai.conjugate(word, "v5k").'
      rescue StandardError => e
        raise MissingDependency,
              "Sudachi is unavailable (#{e.message}). Install a dictionary (e.g. `rake kabosu:install`), " \
              'or pass the JMdict POS explicitly: Daidai.conjugate(word, "v5k").'
      end

      # Transliterate katakana in the range ァ-ヴ to the matching hiragana.
      def hiragana(text)
        text.to_s.tr("ァ-ヴ", "ぁ-ゔ")
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ id name
2
+ 1 Non-past
3
+ 2 Past (~ta)
4
+ 3 Conjunctive (~te)
5
+ 4 Provisional (~eba)
6
+ 5 Potential
7
+ 6 Passive
8
+ 7 Causative
9
+ 8 Causative-Passive
10
+ 9 Volitional
11
+ 10 Imperative
12
+ 11 Conditional (~tara)
13
+ 12 Alternative (~tari)
14
+ 13 Continuative (~i)