eiwa 0.0.2 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +35 -27
- data/README.md +22 -13
- data/lib/eiwa.rb +19 -7
- data/lib/eiwa/jmdict/doc.rb +85 -0
- data/lib/eiwa/jmdict/entities.rb +180 -0
- data/lib/eiwa/kanjidic/doc.rb +43 -0
- data/lib/eiwa/parses_file.rb +35 -0
- data/lib/eiwa/tag/antonym.rb +1 -1
- data/lib/eiwa/tag/bag.rb +21 -0
- data/lib/eiwa/tag/character.rb +24 -0
- data/lib/eiwa/tag/cross_reference.rb +1 -1
- data/lib/eiwa/tag/definition.rb +1 -1
- data/lib/eiwa/tag/entity.rb +1 -3
- data/lib/eiwa/tag/entry.rb +0 -2
- data/lib/eiwa/tag/list.rb +18 -0
- data/lib/eiwa/tag/meaning.rb +0 -2
- data/lib/eiwa/tag/other.rb +5 -3
- data/lib/eiwa/tag/reading.rb +0 -2
- data/lib/eiwa/tag/reading_meaning.rb +11 -0
- data/lib/eiwa/tag/source_language.rb +1 -1
- data/lib/eiwa/tag/spelling.rb +0 -2
- data/lib/eiwa/version.rb +1 -1
- metadata +15 -10
- data/lib/eiwa/jmdict_doc.rb +0 -93
- data/lib/eiwa/jmdict_entities.rb +0 -178
- data/lib/eiwa/parses_jmdict_file.rb +0 -21
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ef277fcf117e28dcbc32fc6e17c8d34f1783b546f9adf5b9452e04d073d31ee8
|
4
|
+
data.tar.gz: d1e744a10c6e688e532da9855790ba1a9fad89338b75f0924ee9a5f90294e3ee
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: '090773b16ffc636c53cbd957e5f6131449c455bf4e00c39908f78403c2c284bd7364b9e33ea9f6d8ac9a930777cfdc1f67e296d701ee7224fd6572da812f5f93'
|
7
|
+
data.tar.gz: d43f3a9ded86a0a3238eaac28c86b1d3d7b95dbe3d7a7c6eb8fe389014c7393600009f0c79f69cf7b06ff8dcd84650c0fa679524703cd36207b795b994892375
|
data/Gemfile.lock
CHANGED
@@ -1,42 +1,50 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
eiwa (0.0
|
4
|
+
eiwa (0.1.0)
|
5
5
|
nokogiri
|
6
6
|
|
7
7
|
GEM
|
8
8
|
remote: https://rubygems.org/
|
9
9
|
specs:
|
10
|
-
ast (2.4.
|
11
|
-
coderay (1.1.
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
parallel (1.
|
19
|
-
parser (
|
20
|
-
ast (~> 2.4.
|
21
|
-
pry (0.
|
22
|
-
coderay (~> 1.1
|
23
|
-
method_source (~>
|
10
|
+
ast (2.4.1)
|
11
|
+
coderay (1.1.3)
|
12
|
+
method_source (1.0.0)
|
13
|
+
mini_portile2 (2.5.0)
|
14
|
+
minitest (5.14.3)
|
15
|
+
nokogiri (1.11.1)
|
16
|
+
mini_portile2 (~> 2.5.0)
|
17
|
+
racc (~> 1.4)
|
18
|
+
parallel (1.20.1)
|
19
|
+
parser (3.0.0.0)
|
20
|
+
ast (~> 2.4.1)
|
21
|
+
pry (0.13.1)
|
22
|
+
coderay (~> 1.1)
|
23
|
+
method_source (~> 1.0)
|
24
|
+
racc (1.5.2)
|
24
25
|
rainbow (3.0.0)
|
25
|
-
rake (13.0.
|
26
|
-
|
27
|
-
|
26
|
+
rake (13.0.3)
|
27
|
+
regexp_parser (2.0.3)
|
28
|
+
rexml (3.2.4)
|
29
|
+
rubocop (1.7.0)
|
28
30
|
parallel (~> 1.10)
|
29
|
-
parser (>= 2.
|
31
|
+
parser (>= 2.7.1.5)
|
30
32
|
rainbow (>= 2.2.2, < 4.0)
|
33
|
+
regexp_parser (>= 1.8, < 3.0)
|
34
|
+
rexml
|
35
|
+
rubocop-ast (>= 1.2.0, < 2.0)
|
31
36
|
ruby-progressbar (~> 1.7)
|
32
|
-
unicode-display_width (>= 1.4.0, <
|
33
|
-
rubocop-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
rubocop (
|
38
|
-
|
39
|
-
|
37
|
+
unicode-display_width (>= 1.4.0, < 2.0)
|
38
|
+
rubocop-ast (1.4.0)
|
39
|
+
parser (>= 2.7.1.5)
|
40
|
+
rubocop-performance (1.9.2)
|
41
|
+
rubocop (>= 0.90.0, < 2.0)
|
42
|
+
rubocop-ast (>= 0.4.0)
|
43
|
+
ruby-progressbar (1.11.0)
|
44
|
+
standard (0.11.0)
|
45
|
+
rubocop (= 1.7.0)
|
46
|
+
rubocop-performance (= 1.9.2)
|
47
|
+
unicode-display_width (1.7.0)
|
40
48
|
|
41
49
|
PLATFORMS
|
42
50
|
ruby
|
data/README.md
CHANGED
@@ -1,7 +1,12 @@
|
|
1
1
|
# eiwa / 英和
|
2
2
|
|
3
|
-
Parses
|
4
|
-
|
3
|
+
Parses two types of Japanese-English dictionaries:
|
4
|
+
|
5
|
+
* `:jmdict_e` - [JMDict](http://www.edrdg.org/jmdict/edict_doc.html)'s
|
6
|
+
English-only export of the WWWJDIC online Japanese dictionary.
|
7
|
+
* `:kanjidic2` - the
|
8
|
+
[KANJIDIC2](http://www.edrdg.org/wiki/index.php/KANJIDIC_Project) dictionary
|
9
|
+
of roughly 13,000 kanji characters
|
5
10
|
|
6
11
|
## Usage
|
7
12
|
|
@@ -23,15 +28,24 @@ gem 'eiwa'
|
|
23
28
|
|
24
29
|
Get your hands on a supported dictionary. For example, eiwa parses
|
25
30
|
[JMDict](http://www.edrdg.org/jmdict/j_jmdict.html), which can be fetched from
|
26
|
-
the [
|
31
|
+
the [EDRDG ftp site](http://ftp.edrdg.org/pub/Nihongo/00INDEX.html) or with a
|
27
32
|
script like this, for the Japanese-English export:
|
28
33
|
|
29
34
|
```bash
|
30
|
-
|
35
|
+
# Download JMDICT-E:
|
36
|
+
$ curl http://ftp.edrdg.org/pub/Nihongo/JMdict_e.gz -o jmdict.xml.gz
|
37
|
+
# Unzip to jmdict.xml
|
38
|
+
$ gunzip jmdict.xml.gz
|
39
|
+
|
40
|
+
# Download KANJIDIC2:
|
41
|
+
$ curl http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -o kanjidic2.xml.gz
|
42
|
+
# Unzip to kanjidic2.xml
|
43
|
+
$ gunzip kanjidic2.xml.gz
|
31
44
|
```
|
32
45
|
|
33
|
-
|
34
|
-
the [WWWJDIC
|
46
|
+
These files are updated daily, and are essentially an export of all vocabulary
|
47
|
+
and kanji in the [WWWJDIC
|
48
|
+
application](http://nihongo.monash.edu/cgi-bin/wwwjdic?1C)
|
35
49
|
|
36
50
|
### Parse the dictionary
|
37
51
|
|
@@ -44,13 +58,11 @@ array and one that will invoke a provided block with each entry, but which won't
|
|
44
58
|
retain a reference to the entries, allowing Ruby to garbage collect them as it
|
45
59
|
goes.
|
46
60
|
|
47
|
-
Parsing the dictionary is CPU intensive, and takes about 13 seconds on my 2019
|
48
|
-
13" MacBook Pro.
|
49
|
-
|
50
61
|
#### Passing a block
|
51
62
|
|
52
63
|
If you just want to do some processing on each entry, it probably makes sense to
|
53
|
-
invoke the library by passing a block
|
64
|
+
invoke the library by passing a block (note that supported types include only
|
65
|
+
`:jmdict_e` and `:kanjidic2`)
|
54
66
|
|
55
67
|
```ruby
|
56
68
|
Eiwa.parse_file("path/to/some.xml", type: :jmdict_e) do |entry|
|
@@ -74,6 +86,3 @@ entries = Eiwa.parse_file("path/to/some.xml", type: :jmdict_e)
|
|
74
86
|
Note that for the abridged Japanese-English dictionary, this will consume about
|
75
87
|
500MB of RAM.
|
76
88
|
|
77
|
-
### The entry object model
|
78
|
-
|
79
|
-
I haven't documented the [Entry](https://github.com/searls/eiwa/blob/master/lib/eiwa/tag/entry.rb) type or its child types yet, but they should be pretty easy to piece together by inspecting the output and [checking the source listings](https://github.com/searls/eiwa/blob/master/lib/eiwa/tag).
|
data/lib/eiwa.rb
CHANGED
@@ -1,15 +1,27 @@
|
|
1
1
|
require "eiwa/version"
|
2
|
-
|
2
|
+
|
3
|
+
require "eiwa/tag/any"
|
4
|
+
require "eiwa/tag/character"
|
5
|
+
require "eiwa/tag/bag"
|
6
|
+
require "eiwa/tag/list"
|
7
|
+
require "eiwa/tag/reading_meaning"
|
8
|
+
require "eiwa/tag/entry"
|
9
|
+
require "eiwa/tag/spelling"
|
10
|
+
require "eiwa/tag/reading"
|
11
|
+
require "eiwa/tag/meaning"
|
12
|
+
require "eiwa/tag/entity"
|
13
|
+
require "eiwa/tag/cross_reference"
|
14
|
+
require "eiwa/tag/antonym"
|
15
|
+
require "eiwa/tag/source_language"
|
16
|
+
require "eiwa/tag/definition"
|
17
|
+
require "eiwa/tag/other"
|
18
|
+
|
19
|
+
require "eiwa/parses_file"
|
3
20
|
|
4
21
|
module Eiwa
|
5
22
|
class Error < StandardError; end
|
6
23
|
|
7
24
|
def self.parse_file(filename, type: :jmdict_e, &each_entry_block)
|
8
|
-
|
9
|
-
when :jmdict_e
|
10
|
-
ParsesJmdictFile.new.call(filename, each_entry_block)
|
11
|
-
else
|
12
|
-
raise Eiwa::Error.new("Unknown file type: #{type}")
|
13
|
-
end
|
25
|
+
ParsesFile.new.call(filename, type, each_entry_block)
|
14
26
|
end
|
15
27
|
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
require_relative "entities"
|
2
|
+
|
3
|
+
module Eiwa
|
4
|
+
module Jmdict
|
5
|
+
TAGS = {
|
6
|
+
"entry" => Tag::Entry,
|
7
|
+
"k_ele" => Tag::Spelling,
|
8
|
+
"r_ele" => Tag::Reading,
|
9
|
+
"sense" => Tag::Meaning,
|
10
|
+
"pos" => Tag::Entity,
|
11
|
+
"misc" => Tag::Entity,
|
12
|
+
"dial" => Tag::Entity,
|
13
|
+
"field" => Tag::Entity,
|
14
|
+
"ke_inf" => Tag::Entity,
|
15
|
+
"re_inf" => Tag::Entity,
|
16
|
+
"xref" => Tag::CrossReference,
|
17
|
+
"ant" => Tag::Antonym,
|
18
|
+
"lsource" => Tag::SourceLanguage,
|
19
|
+
"gloss" => Tag::Definition
|
20
|
+
}
|
21
|
+
|
22
|
+
class Doc < Nokogiri::XML::SAX::Document
|
23
|
+
def initialize(each_entry_block)
|
24
|
+
@each_entry_block = each_entry_block
|
25
|
+
@current = nil
|
26
|
+
end
|
27
|
+
|
28
|
+
def start_document
|
29
|
+
end
|
30
|
+
|
31
|
+
def end_document
|
32
|
+
end
|
33
|
+
|
34
|
+
def start_element(name, attrs)
|
35
|
+
parent = @current
|
36
|
+
@current = (TAGS[name] || Tag::Other).new
|
37
|
+
@current.start(name, attrs, parent)
|
38
|
+
end
|
39
|
+
|
40
|
+
def end_element(name)
|
41
|
+
raise Eiwa::Error.new("Parsing error. Expected <#{@current.tag_name}> to close before <#{name}>") if @current.tag_name != name
|
42
|
+
ending = @current
|
43
|
+
ending.end_self
|
44
|
+
if ending.is_a?(Tag::Entry)
|
45
|
+
@each_entry_block&.call(ending)
|
46
|
+
end
|
47
|
+
|
48
|
+
@current = ending.parent
|
49
|
+
@current&.end_child(ending)
|
50
|
+
end
|
51
|
+
|
52
|
+
def characters(s)
|
53
|
+
@current.add_characters(s)
|
54
|
+
end
|
55
|
+
|
56
|
+
# def comment string
|
57
|
+
# puts "comment #{string}"
|
58
|
+
# end
|
59
|
+
|
60
|
+
# def warning string
|
61
|
+
# puts "warning #{string}"
|
62
|
+
# end
|
63
|
+
|
64
|
+
def error(msg)
|
65
|
+
if (matches = msg.match(/Entity '(\S+)' not defined/))
|
66
|
+
# See: http://github.com/sparklemotion/nokogiri/issues/1926
|
67
|
+
code = matches[1]
|
68
|
+
@current.set_entity(code, ENTITIES[code])
|
69
|
+
elsif msg == "Detected an entity reference loop\n"
|
70
|
+
# Do nothing and hope this does not matter.
|
71
|
+
else
|
72
|
+
raise Eiwa::Error.new("Parsing error: #{msg}")
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
# def cdata_block string
|
77
|
+
# puts "cdata_block #{string}"
|
78
|
+
# end
|
79
|
+
|
80
|
+
# def processing_instruction name, content
|
81
|
+
# puts "processing_instruction #{name}, #{content}"
|
82
|
+
# end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
@@ -0,0 +1,180 @@
|
|
1
|
+
module Eiwa
|
2
|
+
module Jmdict
|
3
|
+
ENTITIES = {
|
4
|
+
"Buddh" => "Buddhist term",
|
5
|
+
"MA" => "martial arts term",
|
6
|
+
"Shinto" => "Shinto term",
|
7
|
+
"X" => "rude or X-rated term (not displayed in educational software)",
|
8
|
+
"abbr" => "abbreviation",
|
9
|
+
"adj-f" => "noun or verb acting prenominally",
|
10
|
+
"adj-i" => "adjective (keiyoushi)",
|
11
|
+
"adj-ix" => "adjective (keiyoushi) - yoi/ii class",
|
12
|
+
"adj-kari" => "`kari' adjective (archaic)",
|
13
|
+
"adj-ku" => "`ku' adjective (archaic)",
|
14
|
+
"adj-na" => "adjectival nouns or quasi-adjectives (keiyodoshi)",
|
15
|
+
"adj-nari" => "archaic/formal form of na-adjective",
|
16
|
+
"adj-no" => "nouns which may take the genitive case particle `no'",
|
17
|
+
"adj-pn" => "pre-noun adjectival (rentaishi)",
|
18
|
+
"adj-shiku" => "`shiku' adjective (archaic)",
|
19
|
+
"adj-t" => "`taru' adjective",
|
20
|
+
"adv" => "adverb (fukushi)",
|
21
|
+
"adv-to" => "adverb taking the `to' particle",
|
22
|
+
"anat" => "anatomical term",
|
23
|
+
"arch" => "archaism",
|
24
|
+
"archit" => "architecture term",
|
25
|
+
"astron" => "astronomy, etc. term",
|
26
|
+
"ateji" => "ateji (phonetic) reading",
|
27
|
+
"aux" => "auxiliary",
|
28
|
+
"aux-adj" => "auxiliary adjective",
|
29
|
+
"aux-v" => "auxiliary verb",
|
30
|
+
"baseb" => "baseball term",
|
31
|
+
"biol" => "biology term",
|
32
|
+
"bot" => "botany term",
|
33
|
+
"bus" => "business term",
|
34
|
+
"chem" => "chemistry term",
|
35
|
+
"chn" => "children's language",
|
36
|
+
"col" => "colloquialism",
|
37
|
+
"comp" => "computer terminology",
|
38
|
+
"conj" => "conjunction",
|
39
|
+
"cop" => "copula",
|
40
|
+
"cop-da" => "copula",
|
41
|
+
"ctr" => "counter",
|
42
|
+
"derog" => "derogatory",
|
43
|
+
"eK" => "exclusively kanji",
|
44
|
+
"econ" => "economics term",
|
45
|
+
"ek" => "exclusively kana",
|
46
|
+
"engr" => "engineering term",
|
47
|
+
"exp" => "expressions (phrases, clauses, etc.)",
|
48
|
+
"fam" => "familiar language",
|
49
|
+
"fem" => "female term or language",
|
50
|
+
"finc" => "finance term",
|
51
|
+
"food" => "food term",
|
52
|
+
"geol" => "geology, etc. term",
|
53
|
+
"geom" => "geometry term",
|
54
|
+
"gikun" => "gikun (meaning as reading) or jukujikun (special kanji reading)",
|
55
|
+
"hob" => "Hokkaido-ben",
|
56
|
+
"hon" => "honorific or respectful (sonkeigo) language",
|
57
|
+
"hum" => "humble (kenjougo) language",
|
58
|
+
"iK" => "word containing irregular kanji usage",
|
59
|
+
"id" => "idiomatic expression",
|
60
|
+
"ik" => "word containing irregular kana usage",
|
61
|
+
"int" => "interjection (kandoushi)",
|
62
|
+
"io" => "irregular okurigana usage",
|
63
|
+
"iv" => "irregular verb",
|
64
|
+
"joc" => "jocular, humorous term",
|
65
|
+
"ksb" => "Kansai-ben",
|
66
|
+
"ktb" => "Kantou-ben",
|
67
|
+
"kyb" => "Kyoto-ben",
|
68
|
+
"kyu" => "Kyuushuu-ben",
|
69
|
+
"law" => "law, etc. term",
|
70
|
+
"ling" => "linguistics terminology",
|
71
|
+
"m-sl" => "manga slang",
|
72
|
+
"mahj" => "mahjong term",
|
73
|
+
"male" => "male term or language",
|
74
|
+
"male-sl" => "male slang",
|
75
|
+
"math" => "mathematics",
|
76
|
+
"med" => "medicine, etc. term",
|
77
|
+
"mil" => "military",
|
78
|
+
"music" => "music term",
|
79
|
+
"n" => "noun (common) (futsuumeishi)",
|
80
|
+
"n-adv" => "adverbial noun (fukushitekimeishi)",
|
81
|
+
"n-pr" => "proper noun",
|
82
|
+
"n-pref" => "noun, used as a prefix",
|
83
|
+
"n-suf" => "noun, used as a suffix",
|
84
|
+
"n-t" => "noun (temporal) (jisoumeishi)",
|
85
|
+
"nab" => "Nagano-ben",
|
86
|
+
"num" => "numeric",
|
87
|
+
"oK" => "word containing out-dated kanji",
|
88
|
+
"obs" => "obsolete term",
|
89
|
+
"obsc" => "obscure term",
|
90
|
+
"oik" => "old or irregular kana form",
|
91
|
+
"ok" => "out-dated or obsolete kana usage",
|
92
|
+
"on-mim" => "onomatopoeic or mimetic word",
|
93
|
+
"osb" => "Osaka-ben",
|
94
|
+
"physics" => "physics terminology",
|
95
|
+
"pn" => "pronoun",
|
96
|
+
"poet" => "poetical term",
|
97
|
+
"pol" => "polite (teineigo) language",
|
98
|
+
"pref" => "prefix",
|
99
|
+
"proverb" => "proverb",
|
100
|
+
"prt" => "particle",
|
101
|
+
"quote" => "quotation",
|
102
|
+
"rare" => "rare",
|
103
|
+
"rkb" => "Ryuukyuu-ben",
|
104
|
+
"sens" => "sensitive",
|
105
|
+
"shogi" => "shogi term",
|
106
|
+
"sl" => "slang",
|
107
|
+
"sports" => "sports term",
|
108
|
+
"suf" => "suffix",
|
109
|
+
"sumo" => "sumo term",
|
110
|
+
"thb" => "Touhoku-ben",
|
111
|
+
"tsb" => "Tosa-ben",
|
112
|
+
"tsug" => "Tsugaru-ben",
|
113
|
+
"uK" => "word usually written using kanji alone",
|
114
|
+
"uk" => "word usually written using kana alone",
|
115
|
+
"unc" => "unclassified",
|
116
|
+
"v-unspec" => "verb unspecified",
|
117
|
+
"v1" => "Ichidan verb",
|
118
|
+
"v1-s" => "Ichidan verb - kureru special class",
|
119
|
+
"v2a-s" => "Nidan verb with 'u' ending (archaic)",
|
120
|
+
"v2b-k" => "Nidan verb (upper class) with `bu' ending (archaic)",
|
121
|
+
"v2b-s" => "Nidan verb (lower class) with `bu' ending (archaic)",
|
122
|
+
"v2d-k" => "Nidan verb (upper class) with `dzu' ending (archaic)",
|
123
|
+
"v2d-s" => "Nidan verb (lower class) with `dzu' ending (archaic)",
|
124
|
+
"v2g-k" => "Nidan verb (upper class) with `gu' ending (archaic)",
|
125
|
+
"v2g-s" => "Nidan verb (lower class) with `gu' ending (archaic)",
|
126
|
+
"v2h-k" => "Nidan verb (upper class) with `hu/fu' ending (archaic)",
|
127
|
+
"v2h-s" => "Nidan verb (lower class) with `hu/fu' ending (archaic)",
|
128
|
+
"v2k-k" => "Nidan verb (upper class) with `ku' ending (archaic)",
|
129
|
+
"v2k-s" => "Nidan verb (lower class) with `ku' ending (archaic)",
|
130
|
+
"v2m-k" => "Nidan verb (upper class) with `mu' ending (archaic)",
|
131
|
+
"v2m-s" => "Nidan verb (lower class) with `mu' ending (archaic)",
|
132
|
+
"v2n-s" => "Nidan verb (lower class) with `nu' ending (archaic)",
|
133
|
+
"v2r-k" => "Nidan verb (upper class) with `ru' ending (archaic)",
|
134
|
+
"v2r-s" => "Nidan verb (lower class) with `ru' ending (archaic)",
|
135
|
+
"v2s-s" => "Nidan verb (lower class) with `su' ending (archaic)",
|
136
|
+
"v2t-k" => "Nidan verb (upper class) with `tsu' ending (archaic)",
|
137
|
+
"v2t-s" => "Nidan verb (lower class) with `tsu' ending (archaic)",
|
138
|
+
"v2w-s" => "Nidan verb (lower class) with `u' ending and `we' conjugation (archaic)",
|
139
|
+
"v2y-k" => "Nidan verb (upper class) with `yu' ending (archaic)",
|
140
|
+
"v2y-s" => "Nidan verb (lower class) with `yu' ending (archaic)",
|
141
|
+
"v2z-s" => "Nidan verb (lower class) with `zu' ending (archaic)",
|
142
|
+
"v4b" => "Yodan verb with `bu' ending (archaic)",
|
143
|
+
"v4g" => "Yodan verb with `gu' ending (archaic)",
|
144
|
+
"v4h" => "Yodan verb with `hu/fu' ending (archaic)",
|
145
|
+
"v4k" => "Yodan verb with `ku' ending (archaic)",
|
146
|
+
"v4m" => "Yodan verb with `mu' ending (archaic)",
|
147
|
+
"v4n" => "Yodan verb with `nu' ending (archaic)",
|
148
|
+
"v4r" => "Yodan verb with `ru' ending (archaic)",
|
149
|
+
"v4s" => "Yodan verb with `su' ending (archaic)",
|
150
|
+
"v4t" => "Yodan verb with `tsu' ending (archaic)",
|
151
|
+
"v5aru" => "Godan verb - -aru special class",
|
152
|
+
"v5b" => "Godan verb with `bu' ending",
|
153
|
+
"v5g" => "Godan verb with `gu' ending",
|
154
|
+
"v5k" => "Godan verb with `ku' ending",
|
155
|
+
"v5k-s" => "Godan verb - Iku/Yuku special class",
|
156
|
+
"v5m" => "Godan verb with `mu' ending",
|
157
|
+
"v5n" => "Godan verb with `nu' ending",
|
158
|
+
"v5r" => "Godan verb with `ru' ending",
|
159
|
+
"v5r-i" => "Godan verb with `ru' ending (irregular verb)",
|
160
|
+
"v5s" => "Godan verb with `su' ending",
|
161
|
+
"v5t" => "Godan verb with `tsu' ending",
|
162
|
+
"v5u" => "Godan verb with `u' ending",
|
163
|
+
"v5u-s" => "Godan verb with `u' ending (special class)",
|
164
|
+
"v5uru" => "Godan verb - Uru old class verb (old form of Eru)",
|
165
|
+
"vi" => "intransitive verb",
|
166
|
+
"vk" => "Kuru verb - special class",
|
167
|
+
"vn" => "irregular nu verb",
|
168
|
+
"vr" => "irregular ru verb, plain form ends with -ri",
|
169
|
+
"vs" => "noun or participle which takes the aux. verb suru",
|
170
|
+
"vs-c" => "su verb - precursor to the modern suru",
|
171
|
+
"vs-i" => "suru verb - included",
|
172
|
+
"vs-s" => "suru verb - special class",
|
173
|
+
"vt" => "transitive verb",
|
174
|
+
"vulg" => "vulgar expression or word",
|
175
|
+
"vz" => "Ichidan verb - zuru verb (alternative form of -jiru verbs)",
|
176
|
+
"yoji" => "yojijukugo",
|
177
|
+
"zool" => "zoology term"
|
178
|
+
}
|
179
|
+
end
|
180
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Eiwa
|
2
|
+
module Kanjidic
|
3
|
+
TAGS = {
|
4
|
+
"character" => Tag::Character,
|
5
|
+
"misc" => Tag::Bag,
|
6
|
+
"reading_meaning" => Tag::ReadingMeaning,
|
7
|
+
"rmgroup" => Tag::List
|
8
|
+
}
|
9
|
+
|
10
|
+
class Doc < Nokogiri::XML::SAX::Document
|
11
|
+
def initialize(each_entry_block)
|
12
|
+
@each_entry_block = each_entry_block
|
13
|
+
@current = nil
|
14
|
+
end
|
15
|
+
|
16
|
+
def start_element(name, attrs)
|
17
|
+
parent = @current
|
18
|
+
@current = (TAGS[name] || Tag::Other).new
|
19
|
+
@current.start(name, attrs, parent)
|
20
|
+
end
|
21
|
+
|
22
|
+
def end_element(name)
|
23
|
+
raise Eiwa::Error.new("Parsing error. Expected <#{@current.tag_name}> to close before <#{name}>") if @current.tag_name != name
|
24
|
+
ending = @current
|
25
|
+
ending.end_self
|
26
|
+
if ending.is_a?(Tag::Character)
|
27
|
+
@each_entry_block&.call(ending)
|
28
|
+
end
|
29
|
+
|
30
|
+
@current = ending.parent
|
31
|
+
@current&.end_child(ending)
|
32
|
+
end
|
33
|
+
|
34
|
+
def characters(s)
|
35
|
+
@current.add_characters(s)
|
36
|
+
end
|
37
|
+
|
38
|
+
def error(msg)
|
39
|
+
raise Eiwa::Error.new("Parsing error: #{msg}")
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
require_relative "jmdict/doc"
|
3
|
+
require_relative "kanjidic/doc"
|
4
|
+
|
5
|
+
module Eiwa
|
6
|
+
class ParsesFile
|
7
|
+
def call(filename, type, each_entry_block)
|
8
|
+
if each_entry_block.nil?
|
9
|
+
entries = []
|
10
|
+
each_entry_block ||= ->(e) { entries << e }
|
11
|
+
end
|
12
|
+
|
13
|
+
doc_for(type).new(each_entry_block).tap do |doc|
|
14
|
+
Nokogiri::XML::SAX::Parser.new(doc).parse_file(filename) do |ctx|
|
15
|
+
ctx.recovery = true
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
entries
|
20
|
+
end
|
21
|
+
|
22
|
+
private
|
23
|
+
|
24
|
+
def doc_for(type)
|
25
|
+
case type
|
26
|
+
when :jmdict_e
|
27
|
+
Jmdict::Doc
|
28
|
+
when :kanjidic2
|
29
|
+
Kanjidic::Doc
|
30
|
+
else
|
31
|
+
raise Eiwa::Error.new("Unknown file type: #{type}")
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
data/lib/eiwa/tag/antonym.rb
CHANGED
data/lib/eiwa/tag/bag.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
module Eiwa
|
2
|
+
module Tag
|
3
|
+
# For simple elements that contain child element_name, value pairs that could plop into a hash nicely
|
4
|
+
class Bag < Any
|
5
|
+
attr_reader :values
|
6
|
+
|
7
|
+
def initialize
|
8
|
+
@values = {}
|
9
|
+
end
|
10
|
+
|
11
|
+
def [](key)
|
12
|
+
@values[key]
|
13
|
+
end
|
14
|
+
|
15
|
+
def end_child(child)
|
16
|
+
# Don't overwrite; the first duplicate tends to be the authoritative one
|
17
|
+
@values[child.tag_name] = child.text unless @values.key?(child.tag_name)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Eiwa
|
2
|
+
module Tag
|
3
|
+
class Character < Any
|
4
|
+
attr_reader :text,
|
5
|
+
:grade, :stroke_count, :freq, :jlpt,
|
6
|
+
:onyomi, :kunyomi, :meanings
|
7
|
+
|
8
|
+
def end_child(child)
|
9
|
+
if child.tag_name == "literal"
|
10
|
+
@text = child.text
|
11
|
+
elsif child.tag_name == "reading_meaning"
|
12
|
+
@onyomi = child.rmgroup.items.select { |item| item.name == "reading" && item.attrs["r_type"] == "ja_on" }.map(&:text)
|
13
|
+
@kunyomi = child.rmgroup.items.select { |item| item.name == "reading" && item.attrs["r_type"] == "ja_kun" }.map(&:text)
|
14
|
+
@meanings = child.rmgroup.items.select { |item| item.name == "meaning" && (item.attrs["m_lang"].nil? || item.attrs["m_lang"] == "en") }.map(&:text)
|
15
|
+
elsif child.tag_name == "misc"
|
16
|
+
@grade = child["grade"]&.to_i
|
17
|
+
@stroke_count = child["stroke_count"]&.to_i
|
18
|
+
@freq = child["freq"]&.to_i
|
19
|
+
@jlpt = child["jlpt"]&.to_i
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
data/lib/eiwa/tag/definition.rb
CHANGED
data/lib/eiwa/tag/entity.rb
CHANGED
data/lib/eiwa/tag/entry.rb
CHANGED
@@ -0,0 +1,18 @@
|
|
1
|
+
module Eiwa
|
2
|
+
module Tag
|
3
|
+
# For containers of lists or repeated elements
|
4
|
+
class List < Any
|
5
|
+
Item = Struct.new(:name, :attrs, :text, keyword_init: true)
|
6
|
+
|
7
|
+
attr_reader :items
|
8
|
+
|
9
|
+
def initialize
|
10
|
+
@items = []
|
11
|
+
end
|
12
|
+
|
13
|
+
def end_child(child)
|
14
|
+
@items << Item.new(name: child.tag_name, attrs: child.attrs, text: child.text)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
data/lib/eiwa/tag/meaning.rb
CHANGED
data/lib/eiwa/tag/other.rb
CHANGED
data/lib/eiwa/tag/reading.rb
CHANGED
data/lib/eiwa/tag/spelling.rb
CHANGED
data/lib/eiwa/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: eiwa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Justin Searls
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-01-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -94,7 +94,7 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
-
description:
|
97
|
+
description:
|
98
98
|
email:
|
99
99
|
- searls@gmail.com
|
100
100
|
executables: []
|
@@ -112,18 +112,23 @@ files:
|
|
112
112
|
- bin/setup
|
113
113
|
- eiwa.gemspec
|
114
114
|
- lib/eiwa.rb
|
115
|
-
- lib/eiwa/
|
116
|
-
- lib/eiwa/
|
117
|
-
- lib/eiwa/
|
115
|
+
- lib/eiwa/jmdict/doc.rb
|
116
|
+
- lib/eiwa/jmdict/entities.rb
|
117
|
+
- lib/eiwa/kanjidic/doc.rb
|
118
|
+
- lib/eiwa/parses_file.rb
|
118
119
|
- lib/eiwa/tag/antonym.rb
|
119
120
|
- lib/eiwa/tag/any.rb
|
121
|
+
- lib/eiwa/tag/bag.rb
|
122
|
+
- lib/eiwa/tag/character.rb
|
120
123
|
- lib/eiwa/tag/cross_reference.rb
|
121
124
|
- lib/eiwa/tag/definition.rb
|
122
125
|
- lib/eiwa/tag/entity.rb
|
123
126
|
- lib/eiwa/tag/entry.rb
|
127
|
+
- lib/eiwa/tag/list.rb
|
124
128
|
- lib/eiwa/tag/meaning.rb
|
125
129
|
- lib/eiwa/tag/other.rb
|
126
130
|
- lib/eiwa/tag/reading.rb
|
131
|
+
- lib/eiwa/tag/reading_meaning.rb
|
127
132
|
- lib/eiwa/tag/source_language.rb
|
128
133
|
- lib/eiwa/tag/spelling.rb
|
129
134
|
- lib/eiwa/version.rb
|
@@ -133,7 +138,7 @@ homepage: https://github.com/searls/eiwa
|
|
133
138
|
licenses:
|
134
139
|
- MIT
|
135
140
|
metadata: {}
|
136
|
-
post_install_message:
|
141
|
+
post_install_message:
|
137
142
|
rdoc_options: []
|
138
143
|
require_paths:
|
139
144
|
- lib
|
@@ -148,8 +153,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
148
153
|
- !ruby/object:Gem::Version
|
149
154
|
version: '0'
|
150
155
|
requirements: []
|
151
|
-
rubygems_version: 3.
|
152
|
-
signing_key:
|
156
|
+
rubygems_version: 3.1.4
|
157
|
+
signing_key:
|
153
158
|
specification_version: 4
|
154
159
|
summary: Parses the JMDict Japanese-English dictionary
|
155
160
|
test_files: []
|
data/lib/eiwa/jmdict_doc.rb
DELETED
@@ -1,93 +0,0 @@
|
|
1
|
-
require_relative "tag/entry"
|
2
|
-
require_relative "tag/spelling"
|
3
|
-
require_relative "tag/reading"
|
4
|
-
require_relative "tag/meaning"
|
5
|
-
require_relative "tag/entity"
|
6
|
-
require_relative "tag/cross_reference"
|
7
|
-
require_relative "tag/antonym"
|
8
|
-
require_relative "tag/source_language"
|
9
|
-
require_relative "tag/definition"
|
10
|
-
require_relative "tag/other"
|
11
|
-
|
12
|
-
require_relative "jmdict_entities"
|
13
|
-
|
14
|
-
module Eiwa
|
15
|
-
TAGS = {
|
16
|
-
"entry" => Tag::Entry,
|
17
|
-
"k_ele" => Tag::Spelling,
|
18
|
-
"r_ele" => Tag::Reading,
|
19
|
-
"sense" => Tag::Meaning,
|
20
|
-
"pos" => Tag::Entity,
|
21
|
-
"misc" => Tag::Entity,
|
22
|
-
"dial" => Tag::Entity,
|
23
|
-
"field" => Tag::Entity,
|
24
|
-
"ke_inf" => Tag::Entity,
|
25
|
-
"re_inf" => Tag::Entity,
|
26
|
-
"xref" => Tag::CrossReference,
|
27
|
-
"ant" => Tag::Antonym,
|
28
|
-
"lsource" => Tag::SourceLanguage,
|
29
|
-
"gloss" => Tag::Definition
|
30
|
-
}
|
31
|
-
|
32
|
-
class JmdictDoc < Nokogiri::XML::SAX::Document
|
33
|
-
def initialize(each_entry_block)
|
34
|
-
@each_entry_block = each_entry_block
|
35
|
-
end
|
36
|
-
|
37
|
-
def start_document
|
38
|
-
end
|
39
|
-
|
40
|
-
def end_document
|
41
|
-
end
|
42
|
-
|
43
|
-
def start_element(name, attrs)
|
44
|
-
parent = @current
|
45
|
-
@current = (TAGS[name] || Tag::Other).new
|
46
|
-
@current.start(name, attrs, parent)
|
47
|
-
end
|
48
|
-
|
49
|
-
def end_element(name)
|
50
|
-
raise Eiwa::Error.new("Parsing error. Expected <#{@current.tag_name}> to close before <#{name}>") if @current.tag_name != name
|
51
|
-
ending = @current
|
52
|
-
ending.end_self
|
53
|
-
if ending.is_a?(Tag::Entry)
|
54
|
-
@each_entry_block&.call(ending)
|
55
|
-
end
|
56
|
-
|
57
|
-
@current = ending.parent
|
58
|
-
@current&.end_child(ending)
|
59
|
-
end
|
60
|
-
|
61
|
-
def characters(s)
|
62
|
-
@current.add_characters(s)
|
63
|
-
end
|
64
|
-
|
65
|
-
# def comment string
|
66
|
-
# puts "comment #{string}"
|
67
|
-
# end
|
68
|
-
|
69
|
-
# def warning string
|
70
|
-
# puts "warning #{string}"
|
71
|
-
# end
|
72
|
-
|
73
|
-
def error(msg)
|
74
|
-
if (matches = msg.match(/Entity '([\S]+)' not defined/))
|
75
|
-
# See: http://github.com/sparklemotion/nokogiri/issues/1926
|
76
|
-
code = matches[1]
|
77
|
-
@current.set_entity(code, JMDICT_ENTITIES[code])
|
78
|
-
elsif msg == "Detected an entity reference loop\n"
|
79
|
-
# Do nothing and hope this does not matter.
|
80
|
-
else
|
81
|
-
raise Eiwa::Error.new("Parsing error: #{msg}")
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
# def cdata_block string
|
86
|
-
# puts "cdata_block #{string}"
|
87
|
-
# end
|
88
|
-
|
89
|
-
# def processing_instruction name, content
|
90
|
-
# puts "processing_instruction #{name}, #{content}"
|
91
|
-
# end
|
92
|
-
end
|
93
|
-
end
|
data/lib/eiwa/jmdict_entities.rb
DELETED
@@ -1,178 +0,0 @@
|
|
1
|
-
module Eiwa
|
2
|
-
JMDICT_ENTITIES = {
|
3
|
-
"Buddh" => "Buddhist term",
|
4
|
-
"MA" => "martial arts term",
|
5
|
-
"Shinto" => "Shinto term",
|
6
|
-
"X" => "rude or X-rated term (not displayed in educational software)",
|
7
|
-
"abbr" => "abbreviation",
|
8
|
-
"adj-f" => "noun or verb acting prenominally",
|
9
|
-
"adj-i" => "adjective (keiyoushi)",
|
10
|
-
"adj-ix" => "adjective (keiyoushi) - yoi/ii class",
|
11
|
-
"adj-kari" => "`kari' adjective (archaic)",
|
12
|
-
"adj-ku" => "`ku' adjective (archaic)",
|
13
|
-
"adj-na" => "adjectival nouns or quasi-adjectives (keiyodoshi)",
|
14
|
-
"adj-nari" => "archaic/formal form of na-adjective",
|
15
|
-
"adj-no" => "nouns which may take the genitive case particle `no'",
|
16
|
-
"adj-pn" => "pre-noun adjectival (rentaishi)",
|
17
|
-
"adj-shiku" => "`shiku' adjective (archaic)",
|
18
|
-
"adj-t" => "`taru' adjective",
|
19
|
-
"adv" => "adverb (fukushi)",
|
20
|
-
"adv-to" => "adverb taking the `to' particle",
|
21
|
-
"anat" => "anatomical term",
|
22
|
-
"arch" => "archaism",
|
23
|
-
"archit" => "architecture term",
|
24
|
-
"astron" => "astronomy, etc. term",
|
25
|
-
"ateji" => "ateji (phonetic) reading",
|
26
|
-
"aux" => "auxiliary",
|
27
|
-
"aux-adj" => "auxiliary adjective",
|
28
|
-
"aux-v" => "auxiliary verb",
|
29
|
-
"baseb" => "baseball term",
|
30
|
-
"biol" => "biology term",
|
31
|
-
"bot" => "botany term",
|
32
|
-
"bus" => "business term",
|
33
|
-
"chem" => "chemistry term",
|
34
|
-
"chn" => "children's language",
|
35
|
-
"col" => "colloquialism",
|
36
|
-
"comp" => "computer terminology",
|
37
|
-
"conj" => "conjunction",
|
38
|
-
"cop" => "copula",
|
39
|
-
"cop-da" => "copula",
|
40
|
-
"ctr" => "counter",
|
41
|
-
"derog" => "derogatory",
|
42
|
-
"eK" => "exclusively kanji",
|
43
|
-
"econ" => "economics term",
|
44
|
-
"ek" => "exclusively kana",
|
45
|
-
"engr" => "engineering term",
|
46
|
-
"exp" => "expressions (phrases, clauses, etc.)",
|
47
|
-
"fam" => "familiar language",
|
48
|
-
"fem" => "female term or language",
|
49
|
-
"finc" => "finance term",
|
50
|
-
"food" => "food term",
|
51
|
-
"geol" => "geology, etc. term",
|
52
|
-
"geom" => "geometry term",
|
53
|
-
"gikun" => "gikun (meaning as reading) or jukujikun (special kanji reading)",
|
54
|
-
"hob" => "Hokkaido-ben",
|
55
|
-
"hon" => "honorific or respectful (sonkeigo) language",
|
56
|
-
"hum" => "humble (kenjougo) language",
|
57
|
-
"iK" => "word containing irregular kanji usage",
|
58
|
-
"id" => "idiomatic expression",
|
59
|
-
"ik" => "word containing irregular kana usage",
|
60
|
-
"int" => "interjection (kandoushi)",
|
61
|
-
"io" => "irregular okurigana usage",
|
62
|
-
"iv" => "irregular verb",
|
63
|
-
"joc" => "jocular, humorous term",
|
64
|
-
"ksb" => "Kansai-ben",
|
65
|
-
"ktb" => "Kantou-ben",
|
66
|
-
"kyb" => "Kyoto-ben",
|
67
|
-
"kyu" => "Kyuushuu-ben",
|
68
|
-
"law" => "law, etc. term",
|
69
|
-
"ling" => "linguistics terminology",
|
70
|
-
"m-sl" => "manga slang",
|
71
|
-
"mahj" => "mahjong term",
|
72
|
-
"male" => "male term or language",
|
73
|
-
"male-sl" => "male slang",
|
74
|
-
"math" => "mathematics",
|
75
|
-
"med" => "medicine, etc. term",
|
76
|
-
"mil" => "military",
|
77
|
-
"music" => "music term",
|
78
|
-
"n" => "noun (common) (futsuumeishi)",
|
79
|
-
"n-adv" => "adverbial noun (fukushitekimeishi)",
|
80
|
-
"n-pr" => "proper noun",
|
81
|
-
"n-pref" => "noun, used as a prefix",
|
82
|
-
"n-suf" => "noun, used as a suffix",
|
83
|
-
"n-t" => "noun (temporal) (jisoumeishi)",
|
84
|
-
"nab" => "Nagano-ben",
|
85
|
-
"num" => "numeric",
|
86
|
-
"oK" => "word containing out-dated kanji",
|
87
|
-
"obs" => "obsolete term",
|
88
|
-
"obsc" => "obscure term",
|
89
|
-
"oik" => "old or irregular kana form",
|
90
|
-
"ok" => "out-dated or obsolete kana usage",
|
91
|
-
"on-mim" => "onomatopoeic or mimetic word",
|
92
|
-
"osb" => "Osaka-ben",
|
93
|
-
"physics" => "physics terminology",
|
94
|
-
"pn" => "pronoun",
|
95
|
-
"poet" => "poetical term",
|
96
|
-
"pol" => "polite (teineigo) language",
|
97
|
-
"pref" => "prefix",
|
98
|
-
"proverb" => "proverb",
|
99
|
-
"prt" => "particle",
|
100
|
-
"quote" => "quotation",
|
101
|
-
"rare" => "rare",
|
102
|
-
"rkb" => "Ryuukyuu-ben",
|
103
|
-
"sens" => "sensitive",
|
104
|
-
"shogi" => "shogi term",
|
105
|
-
"sl" => "slang",
|
106
|
-
"sports" => "sports term",
|
107
|
-
"suf" => "suffix",
|
108
|
-
"sumo" => "sumo term",
|
109
|
-
"thb" => "Touhoku-ben",
|
110
|
-
"tsb" => "Tosa-ben",
|
111
|
-
"tsug" => "Tsugaru-ben",
|
112
|
-
"uK" => "word usually written using kanji alone",
|
113
|
-
"uk" => "word usually written using kana alone",
|
114
|
-
"unc" => "unclassified",
|
115
|
-
"v-unspec" => "verb unspecified",
|
116
|
-
"v1" => "Ichidan verb",
|
117
|
-
"v1-s" => "Ichidan verb - kureru special class",
|
118
|
-
"v2a-s" => "Nidan verb with 'u' ending (archaic)",
|
119
|
-
"v2b-k" => "Nidan verb (upper class) with `bu' ending (archaic)",
|
120
|
-
"v2b-s" => "Nidan verb (lower class) with `bu' ending (archaic)",
|
121
|
-
"v2d-k" => "Nidan verb (upper class) with `dzu' ending (archaic)",
|
122
|
-
"v2d-s" => "Nidan verb (lower class) with `dzu' ending (archaic)",
|
123
|
-
"v2g-k" => "Nidan verb (upper class) with `gu' ending (archaic)",
|
124
|
-
"v2g-s" => "Nidan verb (lower class) with `gu' ending (archaic)",
|
125
|
-
"v2h-k" => "Nidan verb (upper class) with `hu/fu' ending (archaic)",
|
126
|
-
"v2h-s" => "Nidan verb (lower class) with `hu/fu' ending (archaic)",
|
127
|
-
"v2k-k" => "Nidan verb (upper class) with `ku' ending (archaic)",
|
128
|
-
"v2k-s" => "Nidan verb (lower class) with `ku' ending (archaic)",
|
129
|
-
"v2m-k" => "Nidan verb (upper class) with `mu' ending (archaic)",
|
130
|
-
"v2m-s" => "Nidan verb (lower class) with `mu' ending (archaic)",
|
131
|
-
"v2n-s" => "Nidan verb (lower class) with `nu' ending (archaic)",
|
132
|
-
"v2r-k" => "Nidan verb (upper class) with `ru' ending (archaic)",
|
133
|
-
"v2r-s" => "Nidan verb (lower class) with `ru' ending (archaic)",
|
134
|
-
"v2s-s" => "Nidan verb (lower class) with `su' ending (archaic)",
|
135
|
-
"v2t-k" => "Nidan verb (upper class) with `tsu' ending (archaic)",
|
136
|
-
"v2t-s" => "Nidan verb (lower class) with `tsu' ending (archaic)",
|
137
|
-
"v2w-s" => "Nidan verb (lower class) with `u' ending and `we' conjugation (archaic)",
|
138
|
-
"v2y-k" => "Nidan verb (upper class) with `yu' ending (archaic)",
|
139
|
-
"v2y-s" => "Nidan verb (lower class) with `yu' ending (archaic)",
|
140
|
-
"v2z-s" => "Nidan verb (lower class) with `zu' ending (archaic)",
|
141
|
-
"v4b" => "Yodan verb with `bu' ending (archaic)",
|
142
|
-
"v4g" => "Yodan verb with `gu' ending (archaic)",
|
143
|
-
"v4h" => "Yodan verb with `hu/fu' ending (archaic)",
|
144
|
-
"v4k" => "Yodan verb with `ku' ending (archaic)",
|
145
|
-
"v4m" => "Yodan verb with `mu' ending (archaic)",
|
146
|
-
"v4n" => "Yodan verb with `nu' ending (archaic)",
|
147
|
-
"v4r" => "Yodan verb with `ru' ending (archaic)",
|
148
|
-
"v4s" => "Yodan verb with `su' ending (archaic)",
|
149
|
-
"v4t" => "Yodan verb with `tsu' ending (archaic)",
|
150
|
-
"v5aru" => "Godan verb - -aru special class",
|
151
|
-
"v5b" => "Godan verb with `bu' ending",
|
152
|
-
"v5g" => "Godan verb with `gu' ending",
|
153
|
-
"v5k" => "Godan verb with `ku' ending",
|
154
|
-
"v5k-s" => "Godan verb - Iku/Yuku special class",
|
155
|
-
"v5m" => "Godan verb with `mu' ending",
|
156
|
-
"v5n" => "Godan verb with `nu' ending",
|
157
|
-
"v5r" => "Godan verb with `ru' ending",
|
158
|
-
"v5r-i" => "Godan verb with `ru' ending (irregular verb)",
|
159
|
-
"v5s" => "Godan verb with `su' ending",
|
160
|
-
"v5t" => "Godan verb with `tsu' ending",
|
161
|
-
"v5u" => "Godan verb with `u' ending",
|
162
|
-
"v5u-s" => "Godan verb with `u' ending (special class)",
|
163
|
-
"v5uru" => "Godan verb - Uru old class verb (old form of Eru)",
|
164
|
-
"vi" => "intransitive verb",
|
165
|
-
"vk" => "Kuru verb - special class",
|
166
|
-
"vn" => "irregular nu verb",
|
167
|
-
"vr" => "irregular ru verb, plain form ends with -ri",
|
168
|
-
"vs" => "noun or participle which takes the aux. verb suru",
|
169
|
-
"vs-c" => "su verb - precursor to the modern suru",
|
170
|
-
"vs-i" => "suru verb - included",
|
171
|
-
"vs-s" => "suru verb - special class",
|
172
|
-
"vt" => "transitive verb",
|
173
|
-
"vulg" => "vulgar expression or word",
|
174
|
-
"vz" => "Ichidan verb - zuru verb (alternative form of -jiru verbs)",
|
175
|
-
"yoji" => "yojijukugo",
|
176
|
-
"zool" => "zoology term"
|
177
|
-
}
|
178
|
-
end
|
@@ -1,21 +0,0 @@
|
|
1
|
-
require "nokogiri"
|
2
|
-
require_relative "jmdict_doc"
|
3
|
-
|
4
|
-
module Eiwa
|
5
|
-
class ParsesJmdictFile
|
6
|
-
def call(filename, each_entry_block)
|
7
|
-
if each_entry_block.nil?
|
8
|
-
entries = []
|
9
|
-
each_entry_block ||= ->(e) { entries << e }
|
10
|
-
end
|
11
|
-
|
12
|
-
JmdictDoc.new(each_entry_block).tap do |doc|
|
13
|
-
Nokogiri::XML::SAX::Parser.new(doc).parse_file(filename) do |ctx|
|
14
|
-
ctx.recovery = true
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
|
-
entries
|
19
|
-
end
|
20
|
-
end
|
21
|
-
end
|