daidai 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +31 -0
- data/LICENSE +674 -0
- data/NOTICE +42 -0
- data/README.md +267 -0
- data/lib/daidai/conjugator.rb +122 -0
- data/lib/daidai/deinflector.rb +211 -0
- data/lib/daidai/kabosu.rb +134 -0
- data/lib/daidai/resources/conj.csv +14 -0
- data/lib/daidai/resources/conjo.csv +1138 -0
- data/lib/daidai/resources/conotes.csv +18 -0
- data/lib/daidai/resources/japanese-transforms.json +8847 -0
- data/lib/daidai/resources/kwpos.csv +93 -0
- data/lib/daidai/tables.rb +55 -0
- data/lib/daidai/version.rb +5 -0
- data/lib/daidai/word.rb +134 -0
- data/lib/daidai.rb +75 -0
- metadata +121 -0
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Daidai
  class Error < StandardError; end

  # Optional resolver backed by the `kabosu` gem (Ruby bindings for the Sudachi
  # morphological analyzer). Turns a bare word — even an inflected one like
  # "食べている" — into its dictionary form and JMdict part of speech, so you can
  # conjugate without naming the POS yourself:
  #
  #   Daidai.conjugate("食べている")   # kabosu finds 食べる / v1, then conjugates
  #
  # This needs the `kabosu` gem plus an installed Sudachi dictionary. Neither is
  # a hard dependency of daidai — the rest of the gem is pure Ruby and
  # zero-dependency. The escape hatch is simply to pass the POS, in which case
  # kabosu never runs:
  #
  #   Daidai.conjugate("食べる", "v1")
  #
  # NOTE: this module is nested inside Daidai, so the top-level kabosu gem must
  # be referenced as ::Kabosu to avoid resolving back to Daidai::Kabosu.
  module Kabosu
    # Raised when a POS-less conjugation is requested but the `kabosu` gem and a
    # Sudachi dictionary aren't available.
    class MissingDependency < Error; end

    # Sudachi 活用型 (conjugation type) => JMdict POS code. Sudachi names the verb
    # row but not the JMdict subclass for a handful of irregulars, so LEMMA_POS
    # overrides those by dictionary form.
    CONJUGATION_TYPE = {
      "五段-カ行" => "v5k", "五段-ガ行" => "v5g", "五段-サ行" => "v5s",
      "五段-タ行" => "v5t", "五段-ナ行" => "v5n", "五段-バ行" => "v5b",
      "五段-マ行" => "v5m", "五段-ラ行" => "v5r", "五段-ワア行" => "v5u",
      "カ行変格" => "vk", "サ行変格" => "vs-i"
    }.freeze

    # Dictionary-form overrides for verbs whose JMdict subclass Sudachi's 活用型
    # can't distinguish (irregular okurigana inside an otherwise-regular row).
    LEMMA_POS = {
      "行く" => "v5k-s", "逝く" => "v5k-s", "往く" => "v5k-s",
      "有る" => "v5r-i", "在る" => "v5r-i", "ある" => "v5r-i"
    }.freeze

    # Noun subclasses (third slot of the Sudachi POS array) that mark a noun as
    # able to take する. Besides plain サ変可能, Sudachi tags suru-nouns that
    # can also act as na-adjectives (心配, 安心, …) as サ変形状詞可能; both must
    # be treated as 名詞+する compounds.
    SURU_NOUN_SUBCLASSES = %w[サ変可能 サ変形状詞可能].freeze

    class << self
      # Resolve `text` to { word:, pos:, reading: } from its first inflecting
      # morpheme, or nil when nothing conjugatable is found.
      #
      # Only the first morpheme whose POS is 動詞/形容詞/形状詞 is considered; if
      # that morpheme has no JMdict mapping the result is nil (later morphemes
      # are not tried).
      #
      # @param text [String] the word or phrase handed to the tokenizer
      # @return [Hash, nil] `{ word:, pos:, reading: }` or nil
      # @raise [MissingDependency] when kabosu/a dictionary isn't installed
      def resolve(text)
        morphemes = tokenizer.tokenize(text).to_a
        index = morphemes.index { |m| inflecting?(m.part_of_speech) }
        return nil unless index

        morpheme = morphemes[index]
        preceding = index.positive? ? morphemes[index - 1] : nil

        # 名詞+する compounds (勉強した → 勉強, vs): the noun is the dictionary entry.
        if suru?(morpheme.part_of_speech) && preceding && suru_noun?(preceding.part_of_speech)
          return entry(preceding, "vs")
        end

        pos = jmdict_pos(morpheme.part_of_speech, morpheme.dictionary_form)
        pos && entry(morpheme, pos)
      end

      # Pure mapping: a Sudachi part-of-speech array + dictionary form => JMdict
      # POS code, or nil. Exposed (and unit-tested) without needing kabosu.
      #
      # The LEMMA_POS override is looked up first and applies whatever `pos`
      # contains; only when the lemma has no override is the POS array consulted.
      #
      # @param pos [Array<String>] Sudachi part-of-speech array
      # @param lemma [String] dictionary form of the morpheme
      # @return [String, nil] JMdict POS code, or nil when there is no mapping
      def jmdict_pos(pos, lemma)
        LEMMA_POS[lemma] || from_conjugation_type(pos)
      end

      # Whether the resolver is usable (kabosu loadable + a dictionary present).
      #
      # @return [Boolean] false when building the tokenizer raises MissingDependency
      def available?
        !tokenizer.nil?
      rescue MissingDependency
        false
      end

      # Drop the memoized tokenizer so the next use builds a fresh one.
      def reset! = (@tokenizer = nil)

      private

      # Map a Sudachi POS array to a JMdict code using the word class in slot 0
      # and, for verbs, the conjugation type in slot 4. Returns nil for
      # anything unmapped.
      def from_conjugation_type(pos)
        case pos[0]
        when "動詞"
          # Ichidan rows carry a suffix (e.g. 下一段-バ行), hence the prefix match.
          CONJUGATION_TYPE[pos[4]] || (pos[4].to_s.start_with?("上一段", "下一段") ? "v1" : nil)
        when "形容詞" then "adj-i"
        when "形状詞" then "adj-na"
        end
      end

      # True when the POS array's word class is a verb, i-adjective or
      # na-adjective stem.
      def inflecting?(pos)
        %w[動詞 形容詞 形状詞].include?(pos[0])
      end

      # True when the conjugation type (slot 4) is the する row.
      def suru?(pos) = pos[4] == "サ行変格"

      # True when the POS array describes a noun that can take する, including
      # nouns tagged サ変形状詞可能 (suru-capable and na-adjective-capable).
      def suru_noun?(pos) = pos[0] == "名詞" && SURU_NOUN_SUBCLASSES.include?(pos[2])

      # Build the result hash for `morpheme` with the given JMdict POS code.
      def entry(morpheme, pos)
        { word: morpheme.dictionary_form, pos: pos, reading: dictionary_reading(morpheme) }
      end

      # `reading_form` is the *surface* reading; it matches the dictionary form
      # only when the input wasn't inflected. Otherwise leave the reading to
      # conjugate — a kana word is its own reading, and a kanji word just omits
      # the kana column rather than carry a wrong one.
      def dictionary_reading(morpheme)
        return nil unless morpheme.surface == morpheme.dictionary_form

        hiragana(morpheme.reading_form)
      end

      # Memoized tokenizer; built on first use and cleared by reset!.
      def tokenizer
        @tokenizer ||= build_tokenizer
      end

      # Lazily require kabosu and create a tokenizer. A missing gem (LoadError)
      # and any other failure while building it are both re-raised as
      # MissingDependency, each with its own remediation hint.
      def build_tokenizer
        require "kabosu"
        ::Kabosu::Dictionary.new(system_dict: ::Kabosu::Dictionary.path).create(mode: :c)
      rescue LoadError
        raise MissingDependency,
              'Daidai.conjugate(word) without a POS needs the `kabosu` gem. Add `gem "kabosu"` ' \
              '(and install a Sudachi dictionary), or pass the JMdict POS: Daidai.conjugate(word, "v5k").'
      rescue StandardError => e
        raise MissingDependency,
              "Sudachi is unavailable (#{e.message}). Install a dictionary (e.g. `rake kabosu:install`), " \
              'or pass the JMdict POS explicitly: Daidai.conjugate(word, "v5k").'
      end

      # Katakana → hiragana for the ァ–ヴ range; every other character (e.g. the
      # long-vowel mark ー) passes through unchanged. nil becomes "".
      def hiragana(text)
        text.to_s.tr("ァ-ヴ", "ぁ-ゔ")
      end
    end
  end
end
|