eiwa 0.0.2 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +5 -8
- data/.standard.yml +1 -0
- data/Gemfile +5 -1
- data/Gemfile.lock +61 -35
- data/README.md +22 -13
- data/eiwa.gemspec +1 -6
- data/lib/eiwa/jmdict/doc.rb +85 -0
- data/lib/eiwa/jmdict/entities.rb +180 -0
- data/lib/eiwa/kanjidic/doc.rb +43 -0
- data/lib/eiwa/parses_file.rb +35 -0
- data/lib/eiwa/tag/antonym.rb +2 -2
- data/lib/eiwa/tag/any.rb +1 -1
- data/lib/eiwa/tag/bag.rb +21 -0
- data/lib/eiwa/tag/character.rb +24 -0
- data/lib/eiwa/tag/cross_reference.rb +3 -3
- data/lib/eiwa/tag/definition.rb +2 -2
- data/lib/eiwa/tag/entity.rb +2 -4
- data/lib/eiwa/tag/entry.rb +0 -2
- data/lib/eiwa/tag/list.rb +18 -0
- data/lib/eiwa/tag/meaning.rb +0 -2
- data/lib/eiwa/tag/other.rb +5 -3
- data/lib/eiwa/tag/reading.rb +0 -2
- data/lib/eiwa/tag/reading_meaning.rb +11 -0
- data/lib/eiwa/tag/source_language.rb +2 -2
- data/lib/eiwa/tag/spelling.rb +0 -2
- data/lib/eiwa/version.rb +1 -1
- data/lib/eiwa.rb +19 -7
- metadata +19 -83
- data/lib/eiwa/jmdict_doc.rb +0 -93
- data/lib/eiwa/jmdict_entities.rb +0 -178
- data/lib/eiwa/parses_jmdict_file.rb +0 -21
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 87f19acddb018cdf9b99c46b8cd03b38c7ae13d2d006fe31938cf76ebba9da87
|
4
|
+
data.tar.gz: e1049fd3df59e89d3b45998a8bf7fbd0620f29a4ba5908a28df33bf72cbd1bc2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5900ba9dd6094ca0b376f1a7067df65ddd289b6ef74c9b111ed44a28ac07daf7c32c6b92d898b85fc93278718e1e10fb674a90ae51df3b3303a80d4bf6365de2
|
7
|
+
data.tar.gz: ced4f6719aab797bbb5b1ba251f285751ae15982f36f5c296ebf3e6b607feff0350109a284cc8aee768c2cd248d7aa1f81227567ff549961bc4e37ec4193131b
|
data/.github/workflows/ruby.yml
CHANGED
@@ -9,12 +9,9 @@ jobs:
|
|
9
9
|
|
10
10
|
steps:
|
11
11
|
- uses: actions/checkout@v1
|
12
|
-
-
|
13
|
-
uses: actions/setup-ruby@v1
|
12
|
+
- uses: ruby/setup-ruby@v1
|
14
13
|
with:
|
15
|
-
ruby-version:
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
bundle install --jobs 4 --retry 3
|
20
|
-
bundle exec rake
|
14
|
+
ruby-version: '3.3'
|
15
|
+
bundler-cache: true
|
16
|
+
- name: Run tests
|
17
|
+
run: bundle exec rake
|
data/.standard.yml
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
ruby_version: 3.0
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,53 +1,79 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
eiwa (0.
|
5
|
-
nokogiri
|
4
|
+
eiwa (0.1.1)
|
5
|
+
nokogiri (~> 1.15.5)
|
6
6
|
|
7
7
|
GEM
|
8
8
|
remote: https://rubygems.org/
|
9
9
|
specs:
|
10
|
-
ast (2.4.
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
10
|
+
ast (2.4.2)
|
11
|
+
json (2.7.1)
|
12
|
+
language_server-protocol (3.17.0.3)
|
13
|
+
lint_roller (1.1.0)
|
14
|
+
m (1.6.2)
|
15
|
+
method_source (>= 0.6.7)
|
16
|
+
rake (>= 0.9.2.2)
|
17
|
+
method_source (1.0.0)
|
18
|
+
mini_portile2 (2.8.5)
|
19
|
+
minitest (5.22.2)
|
20
|
+
nokogiri (1.15.5)
|
21
|
+
mini_portile2 (~> 2.8.2)
|
22
|
+
racc (~> 1.4)
|
23
|
+
parallel (1.24.0)
|
24
|
+
parser (3.3.0.5)
|
25
|
+
ast (~> 2.4.1)
|
26
|
+
racc
|
27
|
+
racc (1.7.3)
|
28
|
+
rainbow (3.1.1)
|
29
|
+
rake (13.1.0)
|
30
|
+
regexp_parser (2.9.0)
|
31
|
+
rexml (3.2.6)
|
32
|
+
rubocop (1.62.1)
|
33
|
+
json (~> 2.3)
|
34
|
+
language_server-protocol (>= 3.17.0)
|
28
35
|
parallel (~> 1.10)
|
29
|
-
parser (>= 2
|
36
|
+
parser (>= 3.3.0.2)
|
30
37
|
rainbow (>= 2.2.2, < 4.0)
|
38
|
+
regexp_parser (>= 1.8, < 3.0)
|
39
|
+
rexml (>= 3.2.5, < 4.0)
|
40
|
+
rubocop-ast (>= 1.31.1, < 2.0)
|
31
41
|
ruby-progressbar (~> 1.7)
|
32
|
-
unicode-display_width (>=
|
33
|
-
rubocop-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
rubocop (
|
38
|
-
|
39
|
-
|
42
|
+
unicode-display_width (>= 2.4.0, < 3.0)
|
43
|
+
rubocop-ast (1.31.2)
|
44
|
+
parser (>= 3.3.0.4)
|
45
|
+
rubocop-performance (1.20.2)
|
46
|
+
rubocop (>= 1.48.1, < 2.0)
|
47
|
+
rubocop-ast (>= 1.30.0, < 2.0)
|
48
|
+
ruby-progressbar (1.13.0)
|
49
|
+
standard (1.34.0)
|
50
|
+
language_server-protocol (~> 3.17.0.2)
|
51
|
+
lint_roller (~> 1.0)
|
52
|
+
rubocop (~> 1.60)
|
53
|
+
standard-custom (~> 1.0.0)
|
54
|
+
standard-performance (~> 1.3)
|
55
|
+
standard-custom (1.0.2)
|
56
|
+
lint_roller (~> 1.0)
|
57
|
+
rubocop (~> 1.50)
|
58
|
+
standard-performance (1.3.1)
|
59
|
+
lint_roller (~> 1.1)
|
60
|
+
rubocop-performance (~> 1.20.2)
|
61
|
+
unicode-display_width (2.5.0)
|
40
62
|
|
41
63
|
PLATFORMS
|
42
|
-
|
64
|
+
aarch64-linux
|
65
|
+
arm-linux
|
66
|
+
arm64-darwin
|
67
|
+
x86-linux
|
68
|
+
x86_64-darwin
|
69
|
+
x86_64-linux
|
43
70
|
|
44
71
|
DEPENDENCIES
|
45
|
-
bundler (~> 1.17)
|
46
72
|
eiwa!
|
47
|
-
|
48
|
-
|
49
|
-
rake
|
73
|
+
m
|
74
|
+
minitest
|
75
|
+
rake
|
50
76
|
standard
|
51
77
|
|
52
78
|
BUNDLED WITH
|
53
|
-
|
79
|
+
2.5.4
|
data/README.md
CHANGED
@@ -1,7 +1,12 @@
|
|
1
1
|
# eiwa / 英和
|
2
2
|
|
3
|
-
Parses
|
4
|
-
|
3
|
+
Parses two types of Japanese-English dictionaries:
|
4
|
+
|
5
|
+
* `:jmdict_e` - [JMDict](http://www.edrdg.org/jmdict/edict_doc.html)'s
|
6
|
+
English-only export of the WWWJDIC online Japanese dictionary.
|
7
|
+
* `:kanjidic2` - the
|
8
|
+
[KANJIDIC2](http://www.edrdg.org/wiki/index.php/KANJIDIC_Project) dictionary
|
9
|
+
of roughly 13,000 kanji characters
|
5
10
|
|
6
11
|
## Usage
|
7
12
|
|
@@ -23,15 +28,24 @@ gem 'eiwa'
|
|
23
28
|
|
24
29
|
Get your hands on a supported dictionary. Right now eiwa only parses
|
25
30
|
[JMDict](http://www.edrdg.org/jmdict/j_jmdict.html), which can be fetched from
|
26
|
-
the [
|
31
|
+
the [EDRDG ftp site](http://ftp.edrdg.org/pub/Nihongo/00INDEX.html) or with a
|
27
32
|
script like this, for the Japanese-English export:
|
28
33
|
|
29
34
|
```bash
|
30
|
-
|
35
|
+
# Download JMDICT-E:
|
36
|
+
$ curl http://ftp.edrdg.org/pub/Nihongo/JMdict_e.gz -o jmdict.xml.gz
|
37
|
+
# Unzip to jmdict.xml
|
38
|
+
$ gunzip jmdict.xml.gz
|
39
|
+
|
40
|
+
# Download KANJIDIC2:
|
41
|
+
$ curl http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -o kanjidic2.xml.gz
|
42
|
+
# Unzip to kanjidic2.xml
|
43
|
+
$ gunzip kanjidic2.xml.gz
|
31
44
|
```
|
32
45
|
|
33
|
-
|
34
|
-
the [WWWJDIC
|
46
|
+
These files are updated daily, and are essentially an export of all vocabulary
|
47
|
+
and kanji in the [WWWJDIC
|
48
|
+
application](http://nihongo.monash.edu/cgi-bin/wwwjdic?1C).
|
35
49
|
|
36
50
|
### Parse the dictionary
|
37
51
|
|
@@ -44,13 +58,11 @@ array and one that will invoke a provided block with each entry, but which won't
|
|
44
58
|
retain a reference to the entries, allowing Ruby to garbage collect them as it
|
45
59
|
goes.
|
46
60
|
|
47
|
-
Parsing the dictionary is CPU intensive, and takes about 13 seconds on my 2019
|
48
|
-
13" MacBook Pro.
|
49
|
-
|
50
61
|
#### Passing a block
|
51
62
|
|
52
63
|
If you just want to do some processing on each entry, it probably makes sense to
|
53
|
-
invoke the library by passing a block
|
64
|
+
invoke the library by passing a block (note that supported types include only
|
65
|
+
`:jmdict_e` and `:kanjidic2`)
|
54
66
|
|
55
67
|
```ruby
|
56
68
|
Eiwa.parse_file("path/to/some.xml", type: :jmdict_e) do |entry|
|
@@ -74,6 +86,3 @@ entries = Eiwa.parse_file("path/to/some.xml", type: :jmdict_e)
|
|
74
86
|
Note that for the abridged Japanese-English dictionary, this will consume about
|
75
87
|
500MB of RAM.
|
76
88
|
|
77
|
-
### The entry object model
|
78
|
-
|
79
|
-
I haven't documented the [Entry](https://github.com/searls/eiwa/blob/master/lib/eiwa/tag/entry.rb) type or its child types yet, but they should be pretty easy to piece together by inspecting the output and [checking the source listings](https://github.com/searls/eiwa/blob/master/lib/eiwa/tag).
|
data/eiwa.gemspec
CHANGED
@@ -19,10 +19,5 @@ Gem::Specification.new do |spec|
|
|
19
19
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
20
|
spec.require_paths = ["lib"]
|
21
21
|
|
22
|
-
spec.add_dependency "nokogiri"
|
23
|
-
spec.add_development_dependency "bundler", "~> 1.17"
|
24
|
-
spec.add_development_dependency "rake", "~> 13.0"
|
25
|
-
spec.add_development_dependency "minitest", "~> 5.0"
|
26
|
-
spec.add_development_dependency "standard"
|
27
|
-
spec.add_development_dependency "pry"
|
22
|
+
spec.add_dependency "nokogiri", "~> 1.15.5"
|
28
23
|
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
require_relative "entities"
|
2
|
+
|
3
|
+
module Eiwa
|
4
|
+
module Jmdict
|
5
|
+
TAGS = {
|
6
|
+
"entry" => Tag::Entry,
|
7
|
+
"k_ele" => Tag::Spelling,
|
8
|
+
"r_ele" => Tag::Reading,
|
9
|
+
"sense" => Tag::Meaning,
|
10
|
+
"pos" => Tag::Entity,
|
11
|
+
"misc" => Tag::Entity,
|
12
|
+
"dial" => Tag::Entity,
|
13
|
+
"field" => Tag::Entity,
|
14
|
+
"ke_inf" => Tag::Entity,
|
15
|
+
"re_inf" => Tag::Entity,
|
16
|
+
"xref" => Tag::CrossReference,
|
17
|
+
"ant" => Tag::Antonym,
|
18
|
+
"lsource" => Tag::SourceLanguage,
|
19
|
+
"gloss" => Tag::Definition
|
20
|
+
}
|
21
|
+
|
22
|
+
class Doc < Nokogiri::XML::SAX::Document
|
23
|
+
def initialize(each_entry_block)
|
24
|
+
@each_entry_block = each_entry_block
|
25
|
+
@current = nil
|
26
|
+
end
|
27
|
+
|
28
|
+
def start_document
|
29
|
+
end
|
30
|
+
|
31
|
+
def end_document
|
32
|
+
end
|
33
|
+
|
34
|
+
def start_element(name, attrs)
|
35
|
+
parent = @current
|
36
|
+
@current = (TAGS[name] || Tag::Other).new
|
37
|
+
@current.start(name, attrs, parent)
|
38
|
+
end
|
39
|
+
|
40
|
+
def end_element(name)
|
41
|
+
raise Eiwa::Error.new("Parsing error. Expected <#{@current.tag_name}> to close before <#{name}>") if @current.tag_name != name
|
42
|
+
ending = @current
|
43
|
+
ending.end_self
|
44
|
+
if ending.is_a?(Tag::Entry)
|
45
|
+
@each_entry_block&.call(ending)
|
46
|
+
end
|
47
|
+
|
48
|
+
@current = ending.parent
|
49
|
+
@current&.end_child(ending)
|
50
|
+
end
|
51
|
+
|
52
|
+
def characters(s)
|
53
|
+
@current.add_characters(s)
|
54
|
+
end
|
55
|
+
|
56
|
+
# def comment string
|
57
|
+
# puts "comment #{string}"
|
58
|
+
# end
|
59
|
+
|
60
|
+
# def warning string
|
61
|
+
# puts "warning #{string}"
|
62
|
+
# end
|
63
|
+
|
64
|
+
def error(msg)
|
65
|
+
if (matches = msg.match(/Entity '(\S+)' not defined/))
|
66
|
+
# See: http://github.com/sparklemotion/nokogiri/issues/1926
|
67
|
+
code = matches[1]
|
68
|
+
@current.set_entity(code, ENTITIES[code])
|
69
|
+
elsif msg == "Detected an entity reference loop\n"
|
70
|
+
# Do nothing and hope this does not matter.
|
71
|
+
else
|
72
|
+
raise Eiwa::Error.new("Parsing error: #{msg}")
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
# def cdata_block string
|
77
|
+
# puts "cdata_block #{string}"
|
78
|
+
# end
|
79
|
+
|
80
|
+
# def processing_instruction name, content
|
81
|
+
# puts "processing_instruction #{name}, #{content}"
|
82
|
+
# end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
@@ -0,0 +1,180 @@
|
|
1
|
+
module Eiwa
|
2
|
+
module Jmdict
|
3
|
+
ENTITIES = {
|
4
|
+
"Buddh" => "Buddhist term",
|
5
|
+
"MA" => "martial arts term",
|
6
|
+
"Shinto" => "Shinto term",
|
7
|
+
"X" => "rude or X-rated term (not displayed in educational software)",
|
8
|
+
"abbr" => "abbreviation",
|
9
|
+
"adj-f" => "noun or verb acting prenominally",
|
10
|
+
"adj-i" => "adjective (keiyoushi)",
|
11
|
+
"adj-ix" => "adjective (keiyoushi) - yoi/ii class",
|
12
|
+
"adj-kari" => "`kari' adjective (archaic)",
|
13
|
+
"adj-ku" => "`ku' adjective (archaic)",
|
14
|
+
"adj-na" => "adjectival nouns or quasi-adjectives (keiyodoshi)",
|
15
|
+
"adj-nari" => "archaic/formal form of na-adjective",
|
16
|
+
"adj-no" => "nouns which may take the genitive case particle `no'",
|
17
|
+
"adj-pn" => "pre-noun adjectival (rentaishi)",
|
18
|
+
"adj-shiku" => "`shiku' adjective (archaic)",
|
19
|
+
"adj-t" => "`taru' adjective",
|
20
|
+
"adv" => "adverb (fukushi)",
|
21
|
+
"adv-to" => "adverb taking the `to' particle",
|
22
|
+
"anat" => "anatomical term",
|
23
|
+
"arch" => "archaism",
|
24
|
+
"archit" => "architecture term",
|
25
|
+
"astron" => "astronomy, etc. term",
|
26
|
+
"ateji" => "ateji (phonetic) reading",
|
27
|
+
"aux" => "auxiliary",
|
28
|
+
"aux-adj" => "auxiliary adjective",
|
29
|
+
"aux-v" => "auxiliary verb",
|
30
|
+
"baseb" => "baseball term",
|
31
|
+
"biol" => "biology term",
|
32
|
+
"bot" => "botany term",
|
33
|
+
"bus" => "business term",
|
34
|
+
"chem" => "chemistry term",
|
35
|
+
"chn" => "children's language",
|
36
|
+
"col" => "colloquialism",
|
37
|
+
"comp" => "computer terminology",
|
38
|
+
"conj" => "conjunction",
|
39
|
+
"cop" => "copula",
|
40
|
+
"cop-da" => "copula",
|
41
|
+
"ctr" => "counter",
|
42
|
+
"derog" => "derogatory",
|
43
|
+
"eK" => "exclusively kanji",
|
44
|
+
"econ" => "economics term",
|
45
|
+
"ek" => "exclusively kana",
|
46
|
+
"engr" => "engineering term",
|
47
|
+
"exp" => "expressions (phrases, clauses, etc.)",
|
48
|
+
"fam" => "familiar language",
|
49
|
+
"fem" => "female term or language",
|
50
|
+
"finc" => "finance term",
|
51
|
+
"food" => "food term",
|
52
|
+
"geol" => "geology, etc. term",
|
53
|
+
"geom" => "geometry term",
|
54
|
+
"gikun" => "gikun (meaning as reading) or jukujikun (special kanji reading)",
|
55
|
+
"hob" => "Hokkaido-ben",
|
56
|
+
"hon" => "honorific or respectful (sonkeigo) language",
|
57
|
+
"hum" => "humble (kenjougo) language",
|
58
|
+
"iK" => "word containing irregular kanji usage",
|
59
|
+
"id" => "idiomatic expression",
|
60
|
+
"ik" => "word containing irregular kana usage",
|
61
|
+
"int" => "interjection (kandoushi)",
|
62
|
+
"io" => "irregular okurigana usage",
|
63
|
+
"iv" => "irregular verb",
|
64
|
+
"joc" => "jocular, humorous term",
|
65
|
+
"ksb" => "Kansai-ben",
|
66
|
+
"ktb" => "Kantou-ben",
|
67
|
+
"kyb" => "Kyoto-ben",
|
68
|
+
"kyu" => "Kyuushuu-ben",
|
69
|
+
"law" => "law, etc. term",
|
70
|
+
"ling" => "linguistics terminology",
|
71
|
+
"m-sl" => "manga slang",
|
72
|
+
"mahj" => "mahjong term",
|
73
|
+
"male" => "male term or language",
|
74
|
+
"male-sl" => "male slang",
|
75
|
+
"math" => "mathematics",
|
76
|
+
"med" => "medicine, etc. term",
|
77
|
+
"mil" => "military",
|
78
|
+
"music" => "music term",
|
79
|
+
"n" => "noun (common) (futsuumeishi)",
|
80
|
+
"n-adv" => "adverbial noun (fukushitekimeishi)",
|
81
|
+
"n-pr" => "proper noun",
|
82
|
+
"n-pref" => "noun, used as a prefix",
|
83
|
+
"n-suf" => "noun, used as a suffix",
|
84
|
+
"n-t" => "noun (temporal) (jisoumeishi)",
|
85
|
+
"nab" => "Nagano-ben",
|
86
|
+
"num" => "numeric",
|
87
|
+
"oK" => "word containing out-dated kanji",
|
88
|
+
"obs" => "obsolete term",
|
89
|
+
"obsc" => "obscure term",
|
90
|
+
"oik" => "old or irregular kana form",
|
91
|
+
"ok" => "out-dated or obsolete kana usage",
|
92
|
+
"on-mim" => "onomatopoeic or mimetic word",
|
93
|
+
"osb" => "Osaka-ben",
|
94
|
+
"physics" => "physics terminology",
|
95
|
+
"pn" => "pronoun",
|
96
|
+
"poet" => "poetical term",
|
97
|
+
"pol" => "polite (teineigo) language",
|
98
|
+
"pref" => "prefix",
|
99
|
+
"proverb" => "proverb",
|
100
|
+
"prt" => "particle",
|
101
|
+
"quote" => "quotation",
|
102
|
+
"rare" => "rare",
|
103
|
+
"rkb" => "Ryuukyuu-ben",
|
104
|
+
"sens" => "sensitive",
|
105
|
+
"shogi" => "shogi term",
|
106
|
+
"sl" => "slang",
|
107
|
+
"sports" => "sports term",
|
108
|
+
"suf" => "suffix",
|
109
|
+
"sumo" => "sumo term",
|
110
|
+
"thb" => "Touhoku-ben",
|
111
|
+
"tsb" => "Tosa-ben",
|
112
|
+
"tsug" => "Tsugaru-ben",
|
113
|
+
"uK" => "word usually written using kanji alone",
|
114
|
+
"uk" => "word usually written using kana alone",
|
115
|
+
"unc" => "unclassified",
|
116
|
+
"v-unspec" => "verb unspecified",
|
117
|
+
"v1" => "Ichidan verb",
|
118
|
+
"v1-s" => "Ichidan verb - kureru special class",
|
119
|
+
"v2a-s" => "Nidan verb with 'u' ending (archaic)",
|
120
|
+
"v2b-k" => "Nidan verb (upper class) with `bu' ending (archaic)",
|
121
|
+
"v2b-s" => "Nidan verb (lower class) with `bu' ending (archaic)",
|
122
|
+
"v2d-k" => "Nidan verb (upper class) with `dzu' ending (archaic)",
|
123
|
+
"v2d-s" => "Nidan verb (lower class) with `dzu' ending (archaic)",
|
124
|
+
"v2g-k" => "Nidan verb (upper class) with `gu' ending (archaic)",
|
125
|
+
"v2g-s" => "Nidan verb (lower class) with `gu' ending (archaic)",
|
126
|
+
"v2h-k" => "Nidan verb (upper class) with `hu/fu' ending (archaic)",
|
127
|
+
"v2h-s" => "Nidan verb (lower class) with `hu/fu' ending (archaic)",
|
128
|
+
"v2k-k" => "Nidan verb (upper class) with `ku' ending (archaic)",
|
129
|
+
"v2k-s" => "Nidan verb (lower class) with `ku' ending (archaic)",
|
130
|
+
"v2m-k" => "Nidan verb (upper class) with `mu' ending (archaic)",
|
131
|
+
"v2m-s" => "Nidan verb (lower class) with `mu' ending (archaic)",
|
132
|
+
"v2n-s" => "Nidan verb (lower class) with `nu' ending (archaic)",
|
133
|
+
"v2r-k" => "Nidan verb (upper class) with `ru' ending (archaic)",
|
134
|
+
"v2r-s" => "Nidan verb (lower class) with `ru' ending (archaic)",
|
135
|
+
"v2s-s" => "Nidan verb (lower class) with `su' ending (archaic)",
|
136
|
+
"v2t-k" => "Nidan verb (upper class) with `tsu' ending (archaic)",
|
137
|
+
"v2t-s" => "Nidan verb (lower class) with `tsu' ending (archaic)",
|
138
|
+
"v2w-s" => "Nidan verb (lower class) with `u' ending and `we' conjugation (archaic)",
|
139
|
+
"v2y-k" => "Nidan verb (upper class) with `yu' ending (archaic)",
|
140
|
+
"v2y-s" => "Nidan verb (lower class) with `yu' ending (archaic)",
|
141
|
+
"v2z-s" => "Nidan verb (lower class) with `zu' ending (archaic)",
|
142
|
+
"v4b" => "Yodan verb with `bu' ending (archaic)",
|
143
|
+
"v4g" => "Yodan verb with `gu' ending (archaic)",
|
144
|
+
"v4h" => "Yodan verb with `hu/fu' ending (archaic)",
|
145
|
+
"v4k" => "Yodan verb with `ku' ending (archaic)",
|
146
|
+
"v4m" => "Yodan verb with `mu' ending (archaic)",
|
147
|
+
"v4n" => "Yodan verb with `nu' ending (archaic)",
|
148
|
+
"v4r" => "Yodan verb with `ru' ending (archaic)",
|
149
|
+
"v4s" => "Yodan verb with `su' ending (archaic)",
|
150
|
+
"v4t" => "Yodan verb with `tsu' ending (archaic)",
|
151
|
+
"v5aru" => "Godan verb - -aru special class",
|
152
|
+
"v5b" => "Godan verb with `bu' ending",
|
153
|
+
"v5g" => "Godan verb with `gu' ending",
|
154
|
+
"v5k" => "Godan verb with `ku' ending",
|
155
|
+
"v5k-s" => "Godan verb - Iku/Yuku special class",
|
156
|
+
"v5m" => "Godan verb with `mu' ending",
|
157
|
+
"v5n" => "Godan verb with `nu' ending",
|
158
|
+
"v5r" => "Godan verb with `ru' ending",
|
159
|
+
"v5r-i" => "Godan verb with `ru' ending (irregular verb)",
|
160
|
+
"v5s" => "Godan verb with `su' ending",
|
161
|
+
"v5t" => "Godan verb with `tsu' ending",
|
162
|
+
"v5u" => "Godan verb with `u' ending",
|
163
|
+
"v5u-s" => "Godan verb with `u' ending (special class)",
|
164
|
+
"v5uru" => "Godan verb - Uru old class verb (old form of Eru)",
|
165
|
+
"vi" => "intransitive verb",
|
166
|
+
"vk" => "Kuru verb - special class",
|
167
|
+
"vn" => "irregular nu verb",
|
168
|
+
"vr" => "irregular ru verb, plain form ends with -ri",
|
169
|
+
"vs" => "noun or participle which takes the aux. verb suru",
|
170
|
+
"vs-c" => "su verb - precursor to the modern suru",
|
171
|
+
"vs-i" => "suru verb - included",
|
172
|
+
"vs-s" => "suru verb - special class",
|
173
|
+
"vt" => "transitive verb",
|
174
|
+
"vulg" => "vulgar expression or word",
|
175
|
+
"vz" => "Ichidan verb - zuru verb (alternative form of -jiru verbs)",
|
176
|
+
"yoji" => "yojijukugo",
|
177
|
+
"zool" => "zoology term"
|
178
|
+
}
|
179
|
+
end
|
180
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Eiwa
|
2
|
+
module Kanjidic
|
3
|
+
TAGS = {
|
4
|
+
"character" => Tag::Character,
|
5
|
+
"misc" => Tag::Bag,
|
6
|
+
"reading_meaning" => Tag::ReadingMeaning,
|
7
|
+
"rmgroup" => Tag::List
|
8
|
+
}
|
9
|
+
|
10
|
+
class Doc < Nokogiri::XML::SAX::Document
|
11
|
+
def initialize(each_entry_block)
|
12
|
+
@each_entry_block = each_entry_block
|
13
|
+
@current = nil
|
14
|
+
end
|
15
|
+
|
16
|
+
def start_element(name, attrs)
|
17
|
+
parent = @current
|
18
|
+
@current = (TAGS[name] || Tag::Other).new
|
19
|
+
@current.start(name, attrs, parent)
|
20
|
+
end
|
21
|
+
|
22
|
+
def end_element(name)
|
23
|
+
raise Eiwa::Error.new("Parsing error. Expected <#{@current.tag_name}> to close before <#{name}>") if @current.tag_name != name
|
24
|
+
ending = @current
|
25
|
+
ending.end_self
|
26
|
+
if ending.is_a?(Tag::Character)
|
27
|
+
@each_entry_block&.call(ending)
|
28
|
+
end
|
29
|
+
|
30
|
+
@current = ending.parent
|
31
|
+
@current&.end_child(ending)
|
32
|
+
end
|
33
|
+
|
34
|
+
def characters(s)
|
35
|
+
@current.add_characters(s)
|
36
|
+
end
|
37
|
+
|
38
|
+
def error(msg)
|
39
|
+
raise Eiwa::Error.new("Parsing error: #{msg}")
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
require_relative "jmdict/doc"
|
3
|
+
require_relative "kanjidic/doc"
|
4
|
+
|
5
|
+
module Eiwa
|
6
|
+
class ParsesFile
|
7
|
+
def call(filename, type, each_entry_block)
|
8
|
+
if each_entry_block.nil?
|
9
|
+
entries = []
|
10
|
+
each_entry_block ||= ->(e) { entries << e }
|
11
|
+
end
|
12
|
+
|
13
|
+
doc_for(type).new(each_entry_block).tap do |doc|
|
14
|
+
Nokogiri::XML::SAX::Parser.new(doc).parse_file(filename) do |ctx|
|
15
|
+
ctx.recovery = true
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
entries
|
20
|
+
end
|
21
|
+
|
22
|
+
private
|
23
|
+
|
24
|
+
def doc_for(type)
|
25
|
+
case type
|
26
|
+
when :jmdict_e
|
27
|
+
Jmdict::Doc
|
28
|
+
when :kanjidic2
|
29
|
+
Kanjidic::Doc
|
30
|
+
else
|
31
|
+
raise Eiwa::Error.new("Unknown file type: #{type}")
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
data/lib/eiwa/tag/antonym.rb
CHANGED
data/lib/eiwa/tag/any.rb
CHANGED
data/lib/eiwa/tag/bag.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
module Eiwa
|
2
|
+
module Tag
|
3
|
+
# For simple elements that contain child element_name, value pairs that could plop into a hash nicely
|
4
|
+
class Bag < Any
|
5
|
+
attr_reader :values
|
6
|
+
|
7
|
+
def initialize
|
8
|
+
@values = {}
|
9
|
+
end
|
10
|
+
|
11
|
+
def [](key)
|
12
|
+
@values[key]
|
13
|
+
end
|
14
|
+
|
15
|
+
def end_child(child)
|
16
|
+
# Don't overwrite, first dupe tends to be the authoritative one
|
17
|
+
@values[child.tag_name] = child.text unless @values.key?(child.tag_name)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Eiwa
|
2
|
+
module Tag
|
3
|
+
class Character < Any
|
4
|
+
attr_reader :text,
|
5
|
+
:grade, :stroke_count, :freq, :jlpt,
|
6
|
+
:onyomi, :kunyomi, :meanings
|
7
|
+
|
8
|
+
def end_child(child)
|
9
|
+
if child.tag_name == "literal"
|
10
|
+
@text = child.text
|
11
|
+
elsif child.tag_name == "reading_meaning"
|
12
|
+
@onyomi = child.rmgroup.items.select { |item| item.name == "reading" && item.attrs["r_type"] == "ja_on" }.map(&:text)
|
13
|
+
@kunyomi = child.rmgroup.items.select { |item| item.name == "reading" && item.attrs["r_type"] == "ja_kun" }.map(&:text)
|
14
|
+
@meanings = child.rmgroup.items.select { |item| item.name == "meaning" && (item.attrs["m_lang"].nil? || item.attrs["m_lang"] == "en") }.map(&:text)
|
15
|
+
elsif child.tag_name == "misc"
|
16
|
+
@grade = child["grade"]&.to_i
|
17
|
+
@stroke_count = child["stroke_count"]&.to_i
|
18
|
+
@freq = child["freq"]&.to_i
|
19
|
+
@jlpt = child["jlpt"]&.to_i
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|