daidai 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +31 -0
- data/LICENSE +674 -0
- data/NOTICE +42 -0
- data/README.md +267 -0
- data/lib/daidai/conjugator.rb +122 -0
- data/lib/daidai/deinflector.rb +211 -0
- data/lib/daidai/kabosu.rb +134 -0
- data/lib/daidai/resources/conj.csv +14 -0
- data/lib/daidai/resources/conjo.csv +1138 -0
- data/lib/daidai/resources/conotes.csv +18 -0
- data/lib/daidai/resources/japanese-transforms.json +8847 -0
- data/lib/daidai/resources/kwpos.csv +93 -0
- data/lib/daidai/tables.rb +55 -0
- data/lib/daidai/version.rb +5 -0
- data/lib/daidai/word.rb +134 -0
- data/lib/daidai.rb +75 -0
- metadata +121 -0
data/NOTICE
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
Daidai
|
|
2
|
+
Copyright (c) davafons
|
|
3
|
+
|
|
4
|
+
This product bundles conjugation data and ports the conjugation algorithm from
|
|
5
|
+
the JMdictDB project, which are distributed under the GNU General Public
|
|
6
|
+
License. Because of this lineage, Daidai itself is licensed under the GPL-3.0
|
|
7
|
+
(see LICENSE).
|
|
8
|
+
|
|
9
|
+
-------------------------------------------------------------------------------
|
|
10
|
+
jconj
|
|
11
|
+
-------------------------------------------------------------------------------
|
|
12
|
+
The conjugation tables vendored under lib/daidai/resources/ (conj.csv,
|
|
13
|
+
conjo.csv, conotes.csv, kwpos.csv) are copied verbatim from jconj, and Daidai's
|
|
14
|
+
conjugation algorithm is a faithful port of jconj's.
|
|
15
|
+
|
|
16
|
+
jconj — by Stuart McGraw
|
|
17
|
+
https://gitlab.com/yamagoya/jconj
|
|
18
|
+
Licensed under the GNU General Public License.
|
|
19
|
+
|
|
20
|
+
-------------------------------------------------------------------------------
|
|
21
|
+
Yomitan
|
|
22
|
+
-------------------------------------------------------------------------------
|
|
23
|
+
The deinflector (lib/daidai/deinflector.rb) and its rule set vendored under
|
|
24
|
+
lib/daidai/resources/japanese-transforms.json are ported from Yomitan's
|
|
25
|
+
Japanese language transforms and LanguageTransformer.
|
|
26
|
+
|
|
27
|
+
Yomitan — by the Yomitan Authors
|
|
28
|
+
https://github.com/yomidevs/yomitan
|
|
29
|
+
Licensed under the GNU General Public License v3.0.
|
|
30
|
+
|
|
31
|
+
-------------------------------------------------------------------------------
|
|
32
|
+
JMdict / JMdictDB (EDRDG)
|
|
33
|
+
-------------------------------------------------------------------------------
|
|
34
|
+
The conjugation tables and the part-of-speech taxonomy originate in the
|
|
35
|
+
JMdictDB / JMdict project, maintained by Jim Breen's Electronic Dictionary
|
|
36
|
+
Research and Development Group (EDRDG).
|
|
37
|
+
|
|
38
|
+
JMdict / JMdictDB
|
|
39
|
+
https://www.edrdg.org/
|
|
40
|
+
Used under the EDRDG Licence.
|
|
41
|
+
|
|
42
|
+
Please retain this NOTICE file and the attributions above in any redistribution.
|
data/README.md
ADDED
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
<h1 align="center">Daidai</h1>
|
|
2
|
+
|
|
3
|
+
<p align="center">
|
|
4
|
+
<a href="https://rubygems.org/gems/daidai"><img src="https://img.shields.io/gem/v/daidai" alt="Gem Version"></a>
|
|
5
|
+
<a href="https://github.com/basecamp/gh-signoff"><img src="https://img.shields.io/badge/CI-signoff-blue" alt="Signoff"></a>
|
|
6
|
+
<a href="https://github.com/davafons/daidai/blob/main/LICENSE"><img src="https://img.shields.io/github/license/davafons/daidai" alt="License"></a>
|
|
7
|
+
<a href="https://rubygems.org/gems/daidai"><img src="https://img.shields.io/gem/dt/daidai" alt="Downloads"></a>
|
|
8
|
+
</p>
|
|
9
|
+
|
|
10
|
+
Pure-Ruby Japanese verb and adjective conjugation. Daidai (橙) is table-driven: all the grammar lives in the conjugation tables from [JMdictDB](https://gitlab.com/yamagoya/jmdictdb) (Jim Breen's [EDRDG](https://www.edrdg.org/)), the same tables that power EDRDG's live conjugator, applied by a faithful Ruby port of [jconj](https://gitlab.com/yamagoya/jconj)'s algorithm. No native extension, no runtime services, just the tables and a small, app-friendly API.
|
|
11
|
+
|
|
12
|
+
```ruby
|
|
13
|
+
verb = Daidai.conjugate("書く", "v5k") # a word + its JMdict part of speech
|
|
14
|
+
|
|
15
|
+
verb.past # => 書いた
|
|
16
|
+
verb.past(polite: true) # => 書きました
|
|
17
|
+
verb.te # => 書いて
|
|
18
|
+
verb.non_past(negative: true) # => 書かない
|
|
19
|
+
verb.polite.negative.past # => 書きませんでした (fluent, and chainable)
|
|
20
|
+
|
|
21
|
+
# Don't know the part of speech? Let kabosu (Sudachi) resolve it, even from an
|
|
22
|
+
# already-inflected word:
|
|
23
|
+
Daidai.conjugate("食べている").word # => "食べる"
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
- Ruby >= 3.1
|
|
29
|
+
|
|
30
|
+
Add to your Gemfile:
|
|
31
|
+
|
|
32
|
+
```ruby
|
|
33
|
+
gem "daidai"
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Then install:
|
|
37
|
+
|
|
38
|
+
```sh
|
|
39
|
+
bundle install
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
The conjugation tables ship vendored inside the gem, so there is nothing to download; conjugation works offline out of the box.
|
|
43
|
+
|
|
44
|
+
## Usage
|
|
45
|
+
|
|
46
|
+
Pass a dictionary-form word and its [JMdict part-of-speech code](https://www.edrdg.org/jmdictdb/cgi-bin/edhelpq.py?svc=jmdict&sid=#kw_pos). `Daidai.conjugate` returns a `Daidai::Word`, or `nil` when nothing is conjugatable.
|
|
47
|
+
|
|
48
|
+
```ruby
|
|
49
|
+
require "daidai"
|
|
50
|
+
|
|
51
|
+
verb = Daidai.conjugate("書く", "v5k") # word + JMdict POS code
|
|
52
|
+
|
|
53
|
+
verb.past # => #<Daidai::Form past: 書いた>
|
|
54
|
+
verb.past.to_s # => "書いた" (Form#to_s, works directly in "#{...}")
|
|
55
|
+
verb.te # => 書いて
|
|
56
|
+
verb.potential # => 書ける
|
|
57
|
+
verb.volitional # => 書こう
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
The `reading` is **optional**: conjugation only ever rewrites the okurigana, which is already in the surface form, so the kanji forms need no reading. Pass one only when you also want each form's kana:
|
|
61
|
+
|
|
62
|
+
```ruby
|
|
63
|
+
verb = Daidai.conjugate("書く", "v5k", reading: "かく")
|
|
64
|
+
verb.past.kanji # => "書いた"
|
|
65
|
+
verb.past.reading # => "かいた"
|
|
66
|
+
|
|
67
|
+
# A kana-only word is its own reading:
|
|
68
|
+
Daidai.conjugate("する", "vs-i").past.to_s # => "した"
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Negative & polite
|
|
72
|
+
|
|
73
|
+
Polarity and formality are named. Use keyword modifiers (canonical) or chainable fluent views (sugar):
|
|
74
|
+
|
|
75
|
+
```ruby
|
|
76
|
+
verb = Daidai.conjugate("書く", "v5k")
|
|
77
|
+
|
|
78
|
+
# keyword modifiers
|
|
79
|
+
verb.non_past(negative: true) # => 書かない
|
|
80
|
+
verb.past(polite: true) # => 書きました
|
|
81
|
+
verb.past(negative: true, polite: true) # => 書きませんでした
|
|
82
|
+
|
|
83
|
+
# fluent views, read like grammar, and chain
|
|
84
|
+
verb.polite.past # => 書きました
|
|
85
|
+
verb.negative.non_past # => 書かない
|
|
86
|
+
verb.polite.negative.non_past # => 書きません
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### The forms
|
|
90
|
+
|
|
91
|
+
A `Daidai::Word` is `Enumerable` and exposes every form **by name**, with no integer ids:
|
|
92
|
+
|
|
93
|
+
```ruby
|
|
94
|
+
Daidai::FORMS.keys
|
|
95
|
+
# => [:non_past, :past, :te, :provisional, :potential, :passive, :causative,
|
|
96
|
+
# :causative_passive, :volitional, :imperative, :conditional, :alternative, :stem]
|
|
97
|
+
|
|
98
|
+
verb.conjugations # => the form names present for this word
|
|
99
|
+
verb.forms # => every Daidai::Form
|
|
100
|
+
verb.each { |form| ... } # Enumerable
|
|
101
|
+
verb[:past, polite: true] # dynamic access (== verb.past(polite: true))
|
|
102
|
+
verb.variants(:te, negative: true) # every accepted variant: 書かなくて, 書かないで
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
A `Daidai::Form`:
|
|
106
|
+
|
|
107
|
+
```ruby
|
|
108
|
+
form = verb.past(polite: true)
|
|
109
|
+
form.to_s # => "書きました" - the kanji form, or the kana if there is no kanji
|
|
110
|
+
form.kanji # => "書きました"
|
|
111
|
+
form.reading # => nil unless a reading was supplied
|
|
112
|
+
form.name # => :past
|
|
113
|
+
form.label # => "Past"
|
|
114
|
+
form.negative? # => false
|
|
115
|
+
form.polite? # => true
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Word classes
|
|
119
|
+
|
|
120
|
+
```ruby
|
|
121
|
+
Daidai.conjugate("食べる", "v1").kind # => :ichidan
|
|
122
|
+
Daidai.conjugate("来る", "vk").kind # => :kuru (来る / くる both handled)
|
|
123
|
+
Daidai.conjugate("高い", "adj-i").kind # => :i_adjective
|
|
124
|
+
Daidai.conjugate("静か", "adj-na").kind # => :na_adjective (conjugated via the copula だ)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### Checking conjugatability
|
|
128
|
+
|
|
129
|
+
`Daidai.conjugatable?` takes a single JMdict code or an array, and is true when at least one is conjugatable. Handy for deciding whether to show a conjugation table for a dictionary entry:
|
|
130
|
+
|
|
131
|
+
```ruby
|
|
132
|
+
Daidai.conjugatable?("v5k") # => true
|
|
133
|
+
Daidai.conjugatable?("n") # => false
|
|
134
|
+
Daidai.conjugatable?(["n", "v1"]) # => true - first conjugatable code wins
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
When `pos` is an array, `Daidai.conjugate` likewise picks the first conjugatable code.
|
|
138
|
+
|
|
139
|
+
## Conjugate by word alone (optional)
|
|
140
|
+
|
|
141
|
+
Don't have the part of speech? Omit it, and Daidai uses the optional [`kabosu`](https://github.com/davafons/kabosu) gem (Ruby bindings for the [Sudachi](https://github.com/WorksApplications/sudachi.rs) morphological analyzer) to resolve the dictionary form, POS and reading from any input, **including inflected ones**:
|
|
142
|
+
|
|
143
|
+
```ruby
|
|
144
|
+
Daidai.conjugate("食べている").word # => "食べる" (progressive → its dictionary verb)
|
|
145
|
+
Daidai.conjugate("行った").word # => "行く" (irregular v5k-s, correctly identified)
|
|
146
|
+
Daidai.conjugate("高くない").word # => "高い" (negative adjective → adj-i)
|
|
147
|
+
Daidai.conjugate("勉強した").word # => "勉強" (noun + する → vs)
|
|
148
|
+
Daidai.conjugate("猫") # => nil (not conjugatable)
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
This resolves the word so you can conjugate it (forward inflection from a dictionary entry). To go the other way and *name* the inflection ("…is the progressive of…"), see [Deinflection](#deinflection-inflected-form-to-dictionary-form) below. For lemma lookup in a larger app you likely already have a tokenizer; this is a convenience for the conjugation use case.
|
|
152
|
+
|
|
153
|
+
`kabosu` and a Sudachi dictionary are **not** dependencies of Daidai; the gem stays pure Ruby. Add them only if you want this path:
|
|
154
|
+
|
|
155
|
+
```ruby
|
|
156
|
+
# Gemfile
|
|
157
|
+
gem "daidai"
|
|
158
|
+
gem "kabosu"
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
```sh
|
|
162
|
+
bundle exec rake kabosu:install # download a Sudachi dictionary (one-time)
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
Without them, the POS-less path raises `Daidai::Kabosu::MissingDependency`. The escape hatch is always to pass the POS yourself, and then kabosu never loads:
|
|
166
|
+
|
|
167
|
+
```ruby
|
|
168
|
+
Daidai.conjugate("食べる", "v1") # pure Ruby, no kabosu
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## Deinflection (inflected form to dictionary form)
|
|
172
|
+
|
|
173
|
+
`Daidai.deinflect` is the inverse of `conjugate`: give it an inflected surface
|
|
174
|
+
form and it returns the dictionary form(s) it could come from, **naming each
|
|
175
|
+
inflection** along the way. It is pure Ruby and offline, with no Sudachi/kabosu
|
|
176
|
+
needed, and it covers colloquial contractions (てる, ちゃう, とく, …):
|
|
177
|
+
|
|
178
|
+
```ruby
|
|
179
|
+
Daidai.deinflect("食べてる")
|
|
180
|
+
# includes #<Daidai::Deinflection 食べる [-いる, -て]> (the progressive of 食べる)
|
|
181
|
+
|
|
182
|
+
Daidai.deinflect("読まなかった")
|
|
183
|
+
# includes #<Daidai::Deinflection 読む [-た, negative]> (negative past of 読む)
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
Each result is a `Daidai::Deinflection`:
|
|
187
|
+
|
|
188
|
+
```ruby
|
|
189
|
+
d = Daidai.deinflect("食べてる").find { |x| x.term == "食べる" }
|
|
190
|
+
d.term # => "食べる" (the candidate dictionary form)
|
|
191
|
+
d.inflections # => ["-いる", "-て"] (rule names, surface to dictionary)
|
|
192
|
+
d.dictionary_form? # => true (chain lands on a known dictionary form)
|
|
193
|
+
d.to_s # => "食べる [-いる, -て]"
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
Deinflection is rule-based and **dictionary-free**, so it returns *every* base
|
|
197
|
+
form the rules can reach — many of which are not real words (食べてる also yields
|
|
198
|
+
食べつ as a hypothetical potential). It is meant to feed a dictionary lookup: keep
|
|
199
|
+
the candidates whose `term` is a real entry. If you have no dictionary, filtering
|
|
200
|
+
to `dictionary_form?` candidates keeps the plausible lemmas.
|
|
201
|
+
|
|
202
|
+
This pairs naturally with a dictionary like JMdict: deinflect the query, look up
|
|
203
|
+
each candidate `term`, and you have the lemma, its part of speech, and the named
|
|
204
|
+
inflection — without a morphological analyzer. (For a single authoritative lemma
|
|
205
|
+
and reading from arbitrary text, including full sentences, the kabosu path above is
|
|
206
|
+
still the tool; the two are complementary.)
|
|
207
|
+
|
|
208
|
+
The rule set is ported from [Yomitan](https://github.com/yomidevs/yomitan)'s
|
|
209
|
+
Japanese language transforms and is vendored as JSON under
|
|
210
|
+
`lib/daidai/resources/`. Both Yomitan and Daidai are GPL-3.0; see `NOTICE`.
|
|
211
|
+
|
|
212
|
+
## Data & tables
|
|
213
|
+
|
|
214
|
+
All of the linguistic knowledge lives in four tab-separated tables vendored under `lib/daidai/resources/`, taken from **JMdictDB** (the maintained home of these tables; jconj is the standalone reference implementation Daidai ports):
|
|
215
|
+
|
|
216
|
+
| File | Contents |
|
|
217
|
+
|------|----------|
|
|
218
|
+
| `conj.csv` | conjugation ids and their names |
|
|
219
|
+
| `conjo.csv` | okurigana rules (one per pos / conjugation / negative / polite / variant) |
|
|
220
|
+
| `conotes.csv` | usage notes attached to conjugations |
|
|
221
|
+
| `kwpos.csv` | JMdict part-of-speech keywords and their numeric ids |
|
|
222
|
+
|
|
223
|
+
The conjugator just applies these rules: it drops the citation-form okurigana, applies any euphonic change, and appends the conjugated ending. Nothing is hard-coded per verb, so keeping the tables current keeps the whole gem current. (Japanese conjugation grammar is stable, so these tables change rarely.)
|
|
224
|
+
|
|
225
|
+
Refresh the vendored tables from upstream with:
|
|
226
|
+
|
|
227
|
+
```sh
|
|
228
|
+
rake daidai:sync
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
This downloads the latest `conj.csv`, `conjo.csv`, `conotes.csv`, and `kwpos.csv` from the [JMdictDB repository](https://gitlab.com/yamagoya/jmdictdb) and writes them into `lib/daidai/resources/`. Review the diff before committing. `rake daidai:check_resources` fails if the bundled tables have drifted from upstream.
|
|
232
|
+
|
|
233
|
+
## Data & attribution
|
|
234
|
+
|
|
235
|
+
The conjugation algorithm and tables are not original to Daidai. They come from:
|
|
236
|
+
|
|
237
|
+
- **JMdictDB** (<https://gitlab.com/yamagoya/jmdictdb>), by Stuart McGraw: the actively maintained home of the conjugation tables and part-of-speech taxonomy, under Jim Breen's **Electronic Dictionary Research and Development Group (EDRDG)**, <https://www.edrdg.org/>.
|
|
238
|
+
- **jconj** (<https://gitlab.com/yamagoya/jconj>): the standalone, table-based conjugator whose algorithm Daidai ports to Ruby.
|
|
239
|
+
|
|
240
|
+
Because the upstream work is GPL-licensed, Daidai inherits that lineage and is distributed under the **GPL-3.0** license. The JMdict/JMdictDB data is used under the EDRDG licence; please retain the attribution above and the `NOTICE` file in any redistribution.
|
|
241
|
+
|
|
242
|
+
## Development
|
|
243
|
+
|
|
244
|
+
```sh
|
|
245
|
+
bundle install
|
|
246
|
+
|
|
247
|
+
bundle exec rake test # Run the test suite
|
|
248
|
+
bundle exec rake lint # RuboCop
|
|
249
|
+
bundle exec rake # lint + test (default)
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
### Signing off
|
|
253
|
+
|
|
254
|
+
This project uses [gh-signoff](https://github.com/basecamp/gh-signoff) instead of cloud CI: you run the checks locally and sign off on the commit, which sets a `signoff` status check that branch protection requires.
|
|
255
|
+
|
|
256
|
+
```sh
|
|
257
|
+
gh extension install basecamp/gh-signoff # one-time
|
|
258
|
+
bundle exec rake signoff # runs lint + test, signs off ONLY if they pass
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
`rake signoff` makes `lint` and `test` prerequisites, so it won't sign off a red commit. (Running `gh signoff create -f` by hand skips that gate; gh-signoff is trust-based.) The `-f` flag is needed because jj leaves git's HEAD detached.
|
|
262
|
+
|
|
263
|
+
`gh signoff install` configures `main` to require the signoff status. To refresh the vendored conjugation tables from upstream, see `rake daidai:sync` above.
|
|
264
|
+
|
|
265
|
+
## License
|
|
266
|
+
|
|
267
|
+
Daidai is released under the [GPL-3.0](LICENSE) license, in keeping with its JMdictDB / jconj lineage. See `LICENSE` and `NOTICE` for the full terms and upstream attribution.
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "tables"
|
|
4
|
+
require_relative "word"
|
|
5
|
+
|
|
6
|
+
module Daidai
|
|
7
|
+
# Turns a dictionary-form word + its JMdict part-of-speech into the full
|
|
8
|
+
# conjugation paradigm. This is a faithful Ruby port of jconj's table-driven
|
|
9
|
+
# algorithm (Stuart McGraw / EDRDG; GPL) — all the linguistic knowledge lives
|
|
10
|
+
# in the vendored tables, this just applies them.
|
|
11
|
+
module Conjugator
  # The four (negative?, polite?) quadrants every conjugation is generated in.
  QUADRANTS = [ [ false, false ], [ false, true ], [ true, false ], [ true, true ] ].freeze

  # JMdict codes whose full paradigm lives directly in conjo.csv, mapped to a
  # coarse kind for grouping. Archaic classes (v2*, v4*) and bare nouns are
  # deliberately absent — they simply aren't offered for conjugation.
  DIRECT = {
    "adj-i" => :i_adjective, "adj-ix" => :i_adjective,
    "v1" => :ichidan, "v1-s" => :ichidan,
    "v5aru" => :godan, "v5b" => :godan, "v5g" => :godan, "v5k" => :godan,
    "v5k-s" => :godan, "v5m" => :godan, "v5n" => :godan, "v5r" => :godan,
    "v5r-i" => :godan, "v5s" => :godan, "v5t" => :godan, "v5u" => :godan,
    "v5u-s" => :godan,
    "vk" => :kuru, "vs-i" => :suru, "vs-s" => :suru
  }.freeze

  COPULA_POS = 15 # the copula だ — na-adjectives conjugate through it
  SURU_POS = 48   # vs-i (する) — `vs` nouns conjugate by appending する

  class << self
    # True if any of `pos` (a JMdict code or array of codes) can be conjugated.
    # Codes are compared as strings, so symbols are accepted too.
    def conjugatable?(pos)
      Array(pos).any? { |code| strategy(code.to_s) }
    end

    # Conjugate `kanji`/`reading` (dictionary forms) according to `pos`. When
    # `pos` is an array the first conjugatable code wins.
    #
    # Returns a Word, or nil when no code is conjugatable, when both `kanji`
    # and `reading` are blank, or when no form could be built.
    def conjugate(kanji:, reading:, pos:)
      code = Array(pos).map(&:to_s).find { |c| strategy(c) }
      return nil unless code

      strat = strategy(code)
      # Normalise "no kanji" to nil and "no reading" to "" so the helpers
      # below only have one representation of each to deal with.
      kanji = nil if kanji.to_s.empty?
      reading = reading.to_s
      return nil if kanji.nil? && reading.empty?

      forms = build(strat, kanji, reading)
      forms.empty? ? nil : Word.new(word: kanji || reading, pos: code, kind: strat[:kind], forms: forms)
    end

    private

    # Walk every (conjugation, quadrant, onum) row defined for this pos and
    # construct the inflected kanji + reading. `reading` must be a String
    # ("" when none was supplied); `kanji` may be nil.
    def build(strat, kanji, reading)
      if (suffix = strat[:append])
        kanji = (kanji || reading) + suffix
        # Only extend a reading that was actually supplied. Appending to an
        # empty reading would turn it into the bare suffix and every form
        # would then carry a fabricated reading (e.g. "した" for 勉強した).
        reading += suffix unless reading.empty?
      end

      Tables.conj.keys.sort.flat_map do |conj_id|
        QUADRANTS.flat_map do |negative, polite|
          # onum numbers the alternative variants of one conjugation slot.
          (1..9).filter_map do |onum|
            row = Tables.conjo[[ strat[:pos_id], conj_id, negative, polite, onum ]]
            next unless row

            kf = inflect(strat, kanji, row)
            rf = inflect(strat, reading, row)
            next if kf.nil? && rf.nil?

            Form.new(name: FORM_BY_ID[conj_id],
                     negative: negative, polite: polite, onum: onum,
                     kanji: kf, reading: rf)
          end
        end
      end
    end

    # Apply one conjo row to `text`. Returns nil for nil/empty text, so a
    # missing kanji or reading simply produces no counterpart in the form.
    def inflect(strat, text, row)
      return nil if text.nil? || text.empty?

      if strat[:suffix]
        # Copula forms are whole suffixes appended to the citation form
        # (静か → 静かだ / 静かではない); no stem stripping or euphony.
        text + row.okuri.to_s
      else
        construct(text, row.stem, row.okuri, row.euphr, row.euphk)
      end
    end

    # Port of jconj `construct()`: drop `stem` trailing characters (plus one
    # more when a euphonic change applies to this script), then append the
    # euphonic stem char and the okurigana. Whether `text` is treated as kana
    # (euphr) or kanji (euphk) is decided by its next-to-last character — the
    # one that actually inflects. Returns nil when `text` is shorter than two
    # characters or shorter than the number of characters to drop.
    def construct(text, stem, okuri, euphr, euphk)
      return nil if text.length < 2

      # Strictly after "あ" and up to "ん" counts as kana here.
      kana = text[-2] > "あ" && text[-2] <= "ん"
      euph = kana ? euphr : euphk
      stem += 1 if euph
      cut = text.length - stem
      return nil if cut.negative?

      text[0, cut] + (euph || "") + okuri.to_s
    end

    # JMdict code => { pos_id:, kind:, [suffix:|append:] }, or nil when the
    # code is not conjugatable.
    def strategy(code)
      if (kind = DIRECT[code])
        { pos_id: Tables.pos_ids.fetch(code), kind: kind }
      elsif code == "adj-na"
        { pos_id: COPULA_POS, kind: :na_adjective, suffix: true }
      elsif code == "vs"
        { pos_id: SURU_POS, kind: :suru, append: "する" }
      end
    end
  end
end
|
|
122
|
+
end
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module Daidai
|
|
6
|
+
# A single deinflection candidate: a base-form `term` reached from the input by
|
|
7
|
+
# applying `inflections` (transform names, ordered from the surface form inward
|
|
8
|
+
# to the dictionary form). `dictionary_form?` is true when the rule chain lands
|
|
9
|
+
# on a recognised dictionary form (a likely real lemma) — useful for callers
|
|
10
|
+
# without their own dictionary to look the term up in.
|
|
11
|
+
#
|
|
12
|
+
# Daidai.deinflect("食べてる") # candidate base forms, each with named inflections;
|
|
13
|
+
# # one is #<Daidai::Deinflection 食べる [-いる, -て]>
|
|
14
|
+
Deinflection = Struct.new(:term, :inflections, :conditions, :dictionary_form, keyword_init: true) do
  # Predicate reader for the `dictionary_form` member.
  def dictionary_form?
    dictionary_form
  end

  # The bare term when no inflection was applied, otherwise the term followed
  # by the bracketed, comma-separated inflection names.
  def to_s
    return term if inflections.empty?

    "#{term} [#{inflections.join(", ")}]"
  end

  # Compact inspect output built on top of #to_s.
  def inspect
    "#<Daidai::Deinflection #{self}>"
  end
end
|
|
21
|
+
|
|
22
|
+
# Rule-based Japanese deinflector: turns an inflected surface form back into its
|
|
23
|
+
# dictionary form(s), naming each inflection along the way ("食べてる" is the
|
|
24
|
+
# progressive of "食べる"). This is the inverse of Daidai's forward conjugation.
|
|
25
|
+
#
|
|
26
|
+
# The rule set is ported from Yomitan's Japanese language transforms
|
|
27
|
+
# (ext/js/language/ja/japanese-transforms.js), vendored as JSON under
|
|
28
|
+
# resources/; the algorithm is a port of Yomitan's LanguageTransformer. Both are
|
|
29
|
+
# GPL-3.0 — see NOTICE. Unlike Daidai's forward tables, these rules also cover
|
|
30
|
+
# colloquial contractions (てる, ちゃう, とく, …).
|
|
31
|
+
#
|
|
32
|
+
# Unlike `Daidai.conjugate(word)`, this needs no Sudachi/kabosu — it is pure,
|
|
33
|
+
# offline, string-rule deinflection.
|
|
34
|
+
module Deinflector
  DATA_FILE = File.expand_path("resources/japanese-transforms.json", __dir__)

  # One deinflection rule: a test for the inflected form and how to undo it.
  Rule = Struct.new(:is_inflected, :deinflect, :conditions_in, :conditions_out, keyword_init: true)

  # A named group of rules (one grammatical transformation, e.g. "negative").
  Transform = Struct.new(:id, :name, :rules, :heuristic, keyword_init: true)

  # An intermediate (or final) deinflected form during the search.
  TransformedText = Struct.new(:text, :conditions, :trace, keyword_init: true)

  class << self
    # Every deinflection candidate for `text`, faithful to the transformer: each
    # term the rules can reach, with its named inflection chain. Excludes the
    # trivial zero-transform identity. Callers with a dictionary look up each
    # `term`; callers without one can keep only `dictionary_form?` candidates.
    def deinflect(text)
      transform(text)
        .reject { |t| t.trace.empty? }
        .map { |t| to_deinflection(t) }
        .uniq { |d| [ d.term, d.inflections ] }
    end

    # The raw transformer output (a TransformedText per reachable form, including
    # the identity). Mirrors Yomitan's LanguageTransformer#transform.
    def transform(source_text)
      results = [ TransformedText.new(text: source_text, conditions: 0, trace: []) ]
      # `results` doubles as the work queue: it grows while it is being walked,
      # hence the explicit index instead of an iterator.
      i = 0
      while i < results.length
        current = results[i]
        transforms.each do |tf|
          # Cheap pre-filter: skip the whole group when none of its rules can match.
          next unless tf.heuristic.match?(current.text)

          tf.rules.each_with_index do |rule, j|
            next unless conditions_match?(current.conditions, rule.conditions_in)
            next unless rule.is_inflected.match?(current.text)
            next if cycle?(current.trace, tf.id, j, current.text)

            results << TransformedText.new(
              text: rule.deinflect.call(current.text),
              conditions: rule.conditions_out,
              trace: [ { transform: tf.id, rule_index: j, text: current.text } ] + current.trace
            )
          end
        end
        i += 1
      end
      results
    end

    # Drop every memoised table so the next call re-reads DATA_FILE.
    def reload!
      @data = @condition_flags = @dictionary_mask = @transforms = @transforms_by_id = nil
    end

    private

    # Convert a search result into the public Deinflection value.
    def to_deinflection(transformed)
      Deinflection.new(
        term: transformed.text,
        # trace is newest-first (innermost rule first); reverse so the names read
        # from the surface form inward to the dictionary form.
        inflections: transformed.trace.reverse.map { |frame| transforms_by_id[frame[:transform]].name },
        conditions: transformed.conditions,
        dictionary_form: transformed.conditions.anybits?(dictionary_mask)
      )
    end

    # A rule applies when nothing is known yet about the current text
    # (conditions == 0) or when the two bitmasks share at least one bit.
    def conditions_match?(current, following)
      current.zero? || current.anybits?(following)
    end

    # True when this exact (transform, rule, text) step is already on the trace,
    # i.e. applying it again would loop.
    def cycle?(trace, transform_id, rule_index, text)
      trace.any? { |f| f[:transform] == transform_id && f[:rule_index] == rule_index && f[:text] == text }
    end

    # Parsed contents of DATA_FILE, read once and memoised.
    def data
      @data ||= JSON.parse(File.read(DATA_FILE))
    end

    def transforms
      @transforms ||= build_transforms
    end

    def transforms_by_id
      @transforms_by_id ||= transforms.to_h { |t| [ t.id, t ] }
    end

    # Build one Transform per entry of data["transforms"]. The heuristic is the
    # alternation of all rule patterns of that transform.
    def build_transforms
      flags = condition_flags
      data["transforms"].map do |id, t|
        rules = t["rules"].map { |r| build_rule(r, flags) }
        heuristic = Regexp.new(rules.map { |r| r.is_inflected.source }.join("|"))
        Transform.new(id: id, name: t["name"], rules: rules, heuristic: heuristic)
      end
    end

    # Build a rule's matcher + undo closure. The inflected/deinflected fragments
    # are literal text, so they are escaped and matched verbatim. Patterns are
    # anchored with \A / \z (whole string); Ruby's `$` would also match before a
    # newline in the middle of the text, while the undo closures always cut
    # relative to the real start/end of the string.
    #
    # Raises Error for an unknown rule type or for unknown/missing conditions.
    def build_rule(rule, flags)
      inflected, deinflected = rule.values_at("inflected", "deinflected")
      pattern = Regexp.escape(inflected.to_s)
      is_inflected, deinflect =
        case rule["type"]
        when "suffix"
          [ /#{pattern}\z/, ->(text) { text[0...(text.length - inflected.length)] + deinflected } ]
        when "wholeWord"
          [ /\A#{pattern}\z/, ->(_text) { deinflected } ]
        when "prefix"
          [ /\A#{pattern}/, ->(text) { deinflected + text[inflected.length..] } ]
        else
          raise Error, "Unknown deinflection rule type: #{rule["type"]}"
        end
      Rule.new(
        is_inflected: is_inflected,
        deinflect: deinflect,
        conditions_in: rule_flags(flags, rule, "conditionsIn"),
        conditions_out: rule_flags(flags, rule, "conditionsOut")
      )
    end

    # Resolve rule[key] (a list of condition names) to a bitmask, failing loudly
    # at build time instead of leaving a nil mask to blow up during a search.
    def rule_flags(flags, rule, key)
      types = rule[key]
      resolved = types && strict_flags(flags, types)
      # 0 is truthy in Ruby, so an empty condition list passes this check.
      resolved || raise(Error, "Invalid #{key} for deinflection rule #{rule["inflected"].inspect}")
    end

    # Bitmask of every dictionary-form condition, for tagging terminal lemmas.
    # Flags are OR-ed (not added) so overlapping conditions cannot carry bits.
    def dictionary_mask
      @dictionary_mask ||= data["conditions"].reduce(0) do |mask, (type, c)|
        c["isDictionaryForm"] ? mask | condition_flags[type] : mask
      end
    end

    # Assign each leaf condition a distinct bit; a condition with subConditions
    # gets the OR of theirs. Resolved iteratively since a parent may precede its
    # children (a port of LanguageTransformer#_getConditionFlagsMap).
    def condition_flags
      @condition_flags ||= compute_condition_flags
    end

    def compute_condition_flags
      conditions = data["conditions"]
      flags = {}
      next_index = 0
      targets = conditions.keys
      until targets.empty?
        # Conditions whose sub-conditions are not all resolved yet; retried next pass.
        remaining = []
        targets.each do |type|
          sub = conditions[type]["subConditions"]
          if sub.nil?
            raise Error, "Too many deinflection conditions (max 32)" if next_index >= 32

            flags[type] = 1 << next_index
            next_index += 1
          else
            resolved = strict_flags(flags, sub) { remaining << type }
            flags[type] = resolved if resolved
          end
        end
        # No progress in a full pass means the remaining conditions can never resolve.
        raise Error, "Cycle in deinflection sub-conditions" if remaining.size == targets.size

        targets = remaining
      end
      flags
    end

    # OR the flags of every named condition. Yields (and returns nil) when one
    # isn't assigned yet, so condition resolution can defer it to a later pass.
    def strict_flags(flags, types)
      result = 0
      types.each do |type|
        flag = flags[type]
        if flag.nil?
          yield if block_given?
          return nil
        end
        result |= flag
      end
      result
    end
  end
end
|
|
211
|
+
end
|