eiwa 0.0.2 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c5888e4802408cc8efdb55ddadfb560ec38d10971fee95b20dd53af2f31f487c
4
- data.tar.gz: 93b7101b430ee123a905065f87e5d8ba336e1a18f145c7801daeb2b9b9a5ba72
3
+ metadata.gz: 87f19acddb018cdf9b99c46b8cd03b38c7ae13d2d006fe31938cf76ebba9da87
4
+ data.tar.gz: e1049fd3df59e89d3b45998a8bf7fbd0620f29a4ba5908a28df33bf72cbd1bc2
5
5
  SHA512:
6
- metadata.gz: 64faccd9958b9c359fcd7a7ff40de013bd1060bbf9a59735be4d52c824dd3e6e77abbb81ab42ae037b9175990dd15baa33722503b47d8aaebbb037a8a303f965
7
- data.tar.gz: 209efe931acfa8563ea1819f4e9b5a7d07a996d1545458b139de14f82971b27f47e0da726ed5151b3922aa05f6c5809bd3196269e0f6d254cb87e801c81ffe6e
6
+ metadata.gz: 5900ba9dd6094ca0b376f1a7067df65ddd289b6ef74c9b111ed44a28ac07daf7c32c6b92d898b85fc93278718e1e10fb674a90ae51df3b3303a80d4bf6365de2
7
+ data.tar.gz: ced4f6719aab797bbb5b1ba251f285751ae15982f36f5c296ebf3e6b607feff0350109a284cc8aee768c2cd248d7aa1f81227567ff549961bc4e37ec4193131b
@@ -9,12 +9,9 @@ jobs:
9
9
 
10
10
  steps:
11
11
  - uses: actions/checkout@v1
12
- - name: Set up Ruby 2.6
13
- uses: actions/setup-ruby@v1
12
+ - uses: ruby/setup-ruby@v1
14
13
  with:
15
- ruby-version: 2.6.x
16
- - name: Build and test with Rake
17
- run: |
18
- gem install bundler
19
- bundle install --jobs 4 --retry 3
20
- bundle exec rake
14
+ ruby-version: '3.3'
15
+ bundler-cache: true
16
+ - name: Run tests
17
+ run: bundle exec rake
data/.standard.yml ADDED
@@ -0,0 +1 @@
1
+ ruby_version: 3.0
data/Gemfile CHANGED
@@ -2,5 +2,9 @@ source "https://rubygems.org"
2
2
 
3
3
  git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
4
4
 
5
- # Specify your gem's dependencies in eiwa.gemspec
6
5
  gemspec
6
+
7
+ gem "standard"
8
+ gem "minitest"
9
+ gem "rake"
10
+ gem "m"
data/Gemfile.lock CHANGED
@@ -1,53 +1,79 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- eiwa (0.0.2)
5
- nokogiri
4
+ eiwa (0.1.1)
5
+ nokogiri (~> 1.15.5)
6
6
 
7
7
  GEM
8
8
  remote: https://rubygems.org/
9
9
  specs:
10
- ast (2.4.0)
11
- coderay (1.1.2)
12
- jaro_winkler (1.5.3)
13
- method_source (0.9.2)
14
- mini_portile2 (2.4.0)
15
- minitest (5.11.3)
16
- nokogiri (1.10.9)
17
- mini_portile2 (~> 2.4.0)
18
- parallel (1.17.0)
19
- parser (2.6.4.1)
20
- ast (~> 2.4.0)
21
- pry (0.12.2)
22
- coderay (~> 1.1.0)
23
- method_source (~> 0.9.0)
24
- rainbow (3.0.0)
25
- rake (13.0.1)
26
- rubocop (0.72.0)
27
- jaro_winkler (~> 1.5.1)
10
+ ast (2.4.2)
11
+ json (2.7.1)
12
+ language_server-protocol (3.17.0.3)
13
+ lint_roller (1.1.0)
14
+ m (1.6.2)
15
+ method_source (>= 0.6.7)
16
+ rake (>= 0.9.2.2)
17
+ method_source (1.0.0)
18
+ mini_portile2 (2.8.5)
19
+ minitest (5.22.2)
20
+ nokogiri (1.15.5)
21
+ mini_portile2 (~> 2.8.2)
22
+ racc (~> 1.4)
23
+ parallel (1.24.0)
24
+ parser (3.3.0.5)
25
+ ast (~> 2.4.1)
26
+ racc
27
+ racc (1.7.3)
28
+ rainbow (3.1.1)
29
+ rake (13.1.0)
30
+ regexp_parser (2.9.0)
31
+ rexml (3.2.6)
32
+ rubocop (1.62.1)
33
+ json (~> 2.3)
34
+ language_server-protocol (>= 3.17.0)
28
35
  parallel (~> 1.10)
29
- parser (>= 2.6)
36
+ parser (>= 3.3.0.2)
30
37
  rainbow (>= 2.2.2, < 4.0)
38
+ regexp_parser (>= 1.8, < 3.0)
39
+ rexml (>= 3.2.5, < 4.0)
40
+ rubocop-ast (>= 1.31.1, < 2.0)
31
41
  ruby-progressbar (~> 1.7)
32
- unicode-display_width (>= 1.4.0, < 1.7)
33
- rubocop-performance (1.4.1)
34
- rubocop (>= 0.71.0)
35
- ruby-progressbar (1.10.1)
36
- standard (0.1.4)
37
- rubocop (~> 0.72.0)
38
- rubocop-performance (~> 1.4.0)
39
- unicode-display_width (1.6.0)
42
+ unicode-display_width (>= 2.4.0, < 3.0)
43
+ rubocop-ast (1.31.2)
44
+ parser (>= 3.3.0.4)
45
+ rubocop-performance (1.20.2)
46
+ rubocop (>= 1.48.1, < 2.0)
47
+ rubocop-ast (>= 1.30.0, < 2.0)
48
+ ruby-progressbar (1.13.0)
49
+ standard (1.34.0)
50
+ language_server-protocol (~> 3.17.0.2)
51
+ lint_roller (~> 1.0)
52
+ rubocop (~> 1.60)
53
+ standard-custom (~> 1.0.0)
54
+ standard-performance (~> 1.3)
55
+ standard-custom (1.0.2)
56
+ lint_roller (~> 1.0)
57
+ rubocop (~> 1.50)
58
+ standard-performance (1.3.1)
59
+ lint_roller (~> 1.1)
60
+ rubocop-performance (~> 1.20.2)
61
+ unicode-display_width (2.5.0)
40
62
 
41
63
  PLATFORMS
42
- ruby
64
+ aarch64-linux
65
+ arm-linux
66
+ arm64-darwin
67
+ x86-linux
68
+ x86_64-darwin
69
+ x86_64-linux
43
70
 
44
71
  DEPENDENCIES
45
- bundler (~> 1.17)
46
72
  eiwa!
47
- minitest (~> 5.0)
48
- pry
49
- rake (~> 13.0)
73
+ m
74
+ minitest
75
+ rake
50
76
  standard
51
77
 
52
78
  BUNDLED WITH
53
- 1.17.3
79
+ 2.5.4
data/README.md CHANGED
@@ -1,7 +1,12 @@
1
1
  # eiwa / 英和
2
2
 
3
- Parses the Japanese-English version of JMDict, a daily export of the WWWJDIC
4
- online Japanese dictionary.
3
+ Parses two types of Japanese-English dictionaries:
4
+
5
+ * `:jmdict_e` - [JMDict](http://www.edrdg.org/jmdict/edict_doc.html)'s
6
+ English-only export of the WWWJDIC online Japanese dictionary.
7
+ * `:kanjidic2` - the
8
+ [KANJIDIC2](http://www.edrdg.org/wiki/index.php/KANJIDIC_Project) dictionary
9
+ of roughly 13,000 kanji characters
5
10
 
6
11
  ## Usage
7
12
 
@@ -23,15 +28,24 @@ gem 'eiwa'
23
28
 
24
29
  Get your hands on a supported dictionary. Right now eiwa only parses
25
30
  [JMDict](http://www.edrdg.org/jmdict/j_jmdict.html), which can be fetched from
26
- the [Monash ftp site](http://ftp.monash.edu/pub/nihongo/00INDEX.html) or with a
31
+ the [EDRDG ftp site](http://ftp.edrdg.org/pub/Nihongo/00INDEX.html) or with a
27
32
  script like this, for the Japanese-English export:
28
33
 
29
34
  ```bash
30
- curl http://ftp.monash.edu/pub/nihongo/JMdict_e -o jmdict.xml
35
+ # Download JMDICT-E:
36
+ $ curl http://ftp.edrdg.org/pub/Nihongo/JMdict_e.gz -o jmdict.xml.gz
37
+ # Unzip to jmdict.xml
38
+ $ gunzip jmdict.xml.gz
39
+
40
+ # Download KANJIDIC2:
41
+ $ curl http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -o kanjidic2.xml.gz
42
+ # Unzip to kanjidic2.xml
43
+ $ gunzip kanjidic2.xml.gz
31
44
  ```
32
45
 
33
- This file is updated daily, and is essentially an export of all vocabulary on
34
- the [WWWJDIC application](http://nihongo.monash.edu/cgi-bin/wwwjdic?1C)
46
+ These files are updated daily, and are essentially an export of all vocabulary
47
+ and kanji in the [WWWJDIC
48
+ application](http://nihongo.monash.edu/cgi-bin/wwwjdic?1C)
35
49
 
36
50
  ### Parse the dictionary
37
51
 
@@ -44,13 +58,11 @@ array and one that will invoke a provided block with each entry, but which won't
44
58
  retain a reference to the entries, allowing Ruby to garbage collect them as it
45
59
  goes.
46
60
 
47
- Parsing the dictionary is CPU intensive, and takes about 13 seconds on my 2019
48
- 13" MacBook Pro.
49
-
50
61
  #### Passing a block
51
62
 
52
63
  If you just want to do some processing on each entry, it probably makes sense to
53
- invoke the library by passing a block
64
+ invoke the library by passing a block (note that supported types include only
65
+ `:jmdict_e` and `:kanjidic2`)
54
66
 
55
67
  ```ruby
56
68
  Eiwa.parse_file("path/to/some.xml", type: :jmdict_e) do |entry|
@@ -74,6 +86,3 @@ entries = Eiwa.parse_file("path/to/some.xml", type: :jmdict_e)
74
86
  Note that for the abridged Japanese-English dictionary, this will consume about
75
87
  500MB of RAM.
76
88
 
77
- ### The entry object model
78
-
79
- I haven't documented the [Entry](https://github.com/searls/eiwa/blob/master/lib/eiwa/tag/entry.rb) type or its child types yet, but they should be pretty easy to piece together by inspecting the output and [checking the source listings](https://github.com/searls/eiwa/blob/master/lib/eiwa/tag).
data/eiwa.gemspec CHANGED
@@ -19,10 +19,5 @@ Gem::Specification.new do |spec|
19
19
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
20
  spec.require_paths = ["lib"]
21
21
 
22
- spec.add_dependency "nokogiri"
23
- spec.add_development_dependency "bundler", "~> 1.17"
24
- spec.add_development_dependency "rake", "~> 13.0"
25
- spec.add_development_dependency "minitest", "~> 5.0"
26
- spec.add_development_dependency "standard"
27
- spec.add_development_dependency "pry"
22
+ spec.add_dependency "nokogiri", "~> 1.15.5"
28
23
  end
@@ -0,0 +1,85 @@
1
+ require_relative "entities"
2
+
3
+ module Eiwa
4
+ module Jmdict
5
+ TAGS = {
6
+ "entry" => Tag::Entry,
7
+ "k_ele" => Tag::Spelling,
8
+ "r_ele" => Tag::Reading,
9
+ "sense" => Tag::Meaning,
10
+ "pos" => Tag::Entity,
11
+ "misc" => Tag::Entity,
12
+ "dial" => Tag::Entity,
13
+ "field" => Tag::Entity,
14
+ "ke_inf" => Tag::Entity,
15
+ "re_inf" => Tag::Entity,
16
+ "xref" => Tag::CrossReference,
17
+ "ant" => Tag::Antonym,
18
+ "lsource" => Tag::SourceLanguage,
19
+ "gloss" => Tag::Definition
20
+ }
21
+
22
+ class Doc < Nokogiri::XML::SAX::Document
23
+ def initialize(each_entry_block)
24
+ @each_entry_block = each_entry_block
25
+ @current = nil
26
+ end
27
+
28
+ def start_document
29
+ end
30
+
31
+ def end_document
32
+ end
33
+
34
+ def start_element(name, attrs)
35
+ parent = @current
36
+ @current = (TAGS[name] || Tag::Other).new
37
+ @current.start(name, attrs, parent)
38
+ end
39
+
40
+ def end_element(name)
41
+ raise Eiwa::Error.new("Parsing error. Expected <#{@current.tag_name}> to close before <#{name}>") if @current.tag_name != name
42
+ ending = @current
43
+ ending.end_self
44
+ if ending.is_a?(Tag::Entry)
45
+ @each_entry_block&.call(ending)
46
+ end
47
+
48
+ @current = ending.parent
49
+ @current&.end_child(ending)
50
+ end
51
+
52
+ def characters(s)
53
+ @current.add_characters(s)
54
+ end
55
+
56
+ # def comment string
57
+ # puts "comment #{string}"
58
+ # end
59
+
60
+ # def warning string
61
+ # puts "warning #{string}"
62
+ # end
63
+
64
+ def error(msg)
65
+ if (matches = msg.match(/Entity '(\S+)' not defined/))
66
+ # See: http://github.com/sparklemotion/nokogiri/issues/1926
67
+ code = matches[1]
68
+ @current.set_entity(code, ENTITIES[code])
69
+ elsif msg == "Detected an entity reference loop\n"
70
+ # Do nothing and hope this does not matter.
71
+ else
72
+ raise Eiwa::Error.new("Parsing error: #{msg}")
73
+ end
74
+ end
75
+
76
+ # def cdata_block string
77
+ # puts "cdata_block #{string}"
78
+ # end
79
+
80
+ # def processing_instruction name, content
81
+ # puts "processing_instruction #{name}, #{content}"
82
+ # end
83
+ end
84
+ end
85
+ end
@@ -0,0 +1,180 @@
1
+ module Eiwa
2
+ module Jmdict
3
+ ENTITIES = {
4
+ "Buddh" => "Buddhist term",
5
+ "MA" => "martial arts term",
6
+ "Shinto" => "Shinto term",
7
+ "X" => "rude or X-rated term (not displayed in educational software)",
8
+ "abbr" => "abbreviation",
9
+ "adj-f" => "noun or verb acting prenominally",
10
+ "adj-i" => "adjective (keiyoushi)",
11
+ "adj-ix" => "adjective (keiyoushi) - yoi/ii class",
12
+ "adj-kari" => "`kari' adjective (archaic)",
13
+ "adj-ku" => "`ku' adjective (archaic)",
14
+ "adj-na" => "adjectival nouns or quasi-adjectives (keiyodoshi)",
15
+ "adj-nari" => "archaic/formal form of na-adjective",
16
+ "adj-no" => "nouns which may take the genitive case particle `no'",
17
+ "adj-pn" => "pre-noun adjectival (rentaishi)",
18
+ "adj-shiku" => "`shiku' adjective (archaic)",
19
+ "adj-t" => "`taru' adjective",
20
+ "adv" => "adverb (fukushi)",
21
+ "adv-to" => "adverb taking the `to' particle",
22
+ "anat" => "anatomical term",
23
+ "arch" => "archaism",
24
+ "archit" => "architecture term",
25
+ "astron" => "astronomy, etc. term",
26
+ "ateji" => "ateji (phonetic) reading",
27
+ "aux" => "auxiliary",
28
+ "aux-adj" => "auxiliary adjective",
29
+ "aux-v" => "auxiliary verb",
30
+ "baseb" => "baseball term",
31
+ "biol" => "biology term",
32
+ "bot" => "botany term",
33
+ "bus" => "business term",
34
+ "chem" => "chemistry term",
35
+ "chn" => "children's language",
36
+ "col" => "colloquialism",
37
+ "comp" => "computer terminology",
38
+ "conj" => "conjunction",
39
+ "cop" => "copula",
40
+ "cop-da" => "copula",
41
+ "ctr" => "counter",
42
+ "derog" => "derogatory",
43
+ "eK" => "exclusively kanji",
44
+ "econ" => "economics term",
45
+ "ek" => "exclusively kana",
46
+ "engr" => "engineering term",
47
+ "exp" => "expressions (phrases, clauses, etc.)",
48
+ "fam" => "familiar language",
49
+ "fem" => "female term or language",
50
+ "finc" => "finance term",
51
+ "food" => "food term",
52
+ "geol" => "geology, etc. term",
53
+ "geom" => "geometry term",
54
+ "gikun" => "gikun (meaning as reading) or jukujikun (special kanji reading)",
55
+ "hob" => "Hokkaido-ben",
56
+ "hon" => "honorific or respectful (sonkeigo) language",
57
+ "hum" => "humble (kenjougo) language",
58
+ "iK" => "word containing irregular kanji usage",
59
+ "id" => "idiomatic expression",
60
+ "ik" => "word containing irregular kana usage",
61
+ "int" => "interjection (kandoushi)",
62
+ "io" => "irregular okurigana usage",
63
+ "iv" => "irregular verb",
64
+ "joc" => "jocular, humorous term",
65
+ "ksb" => "Kansai-ben",
66
+ "ktb" => "Kantou-ben",
67
+ "kyb" => "Kyoto-ben",
68
+ "kyu" => "Kyuushuu-ben",
69
+ "law" => "law, etc. term",
70
+ "ling" => "linguistics terminology",
71
+ "m-sl" => "manga slang",
72
+ "mahj" => "mahjong term",
73
+ "male" => "male term or language",
74
+ "male-sl" => "male slang",
75
+ "math" => "mathematics",
76
+ "med" => "medicine, etc. term",
77
+ "mil" => "military",
78
+ "music" => "music term",
79
+ "n" => "noun (common) (futsuumeishi)",
80
+ "n-adv" => "adverbial noun (fukushitekimeishi)",
81
+ "n-pr" => "proper noun",
82
+ "n-pref" => "noun, used as a prefix",
83
+ "n-suf" => "noun, used as a suffix",
84
+ "n-t" => "noun (temporal) (jisoumeishi)",
85
+ "nab" => "Nagano-ben",
86
+ "num" => "numeric",
87
+ "oK" => "word containing out-dated kanji",
88
+ "obs" => "obsolete term",
89
+ "obsc" => "obscure term",
90
+ "oik" => "old or irregular kana form",
91
+ "ok" => "out-dated or obsolete kana usage",
92
+ "on-mim" => "onomatopoeic or mimetic word",
93
+ "osb" => "Osaka-ben",
94
+ "physics" => "physics terminology",
95
+ "pn" => "pronoun",
96
+ "poet" => "poetical term",
97
+ "pol" => "polite (teineigo) language",
98
+ "pref" => "prefix",
99
+ "proverb" => "proverb",
100
+ "prt" => "particle",
101
+ "quote" => "quotation",
102
+ "rare" => "rare",
103
+ "rkb" => "Ryuukyuu-ben",
104
+ "sens" => "sensitive",
105
+ "shogi" => "shogi term",
106
+ "sl" => "slang",
107
+ "sports" => "sports term",
108
+ "suf" => "suffix",
109
+ "sumo" => "sumo term",
110
+ "thb" => "Touhoku-ben",
111
+ "tsb" => "Tosa-ben",
112
+ "tsug" => "Tsugaru-ben",
113
+ "uK" => "word usually written using kanji alone",
114
+ "uk" => "word usually written using kana alone",
115
+ "unc" => "unclassified",
116
+ "v-unspec" => "verb unspecified",
117
+ "v1" => "Ichidan verb",
118
+ "v1-s" => "Ichidan verb - kureru special class",
119
+ "v2a-s" => "Nidan verb with 'u' ending (archaic)",
120
+ "v2b-k" => "Nidan verb (upper class) with `bu' ending (archaic)",
121
+ "v2b-s" => "Nidan verb (lower class) with `bu' ending (archaic)",
122
+ "v2d-k" => "Nidan verb (upper class) with `dzu' ending (archaic)",
123
+ "v2d-s" => "Nidan verb (lower class) with `dzu' ending (archaic)",
124
+ "v2g-k" => "Nidan verb (upper class) with `gu' ending (archaic)",
125
+ "v2g-s" => "Nidan verb (lower class) with `gu' ending (archaic)",
126
+ "v2h-k" => "Nidan verb (upper class) with `hu/fu' ending (archaic)",
127
+ "v2h-s" => "Nidan verb (lower class) with `hu/fu' ending (archaic)",
128
+ "v2k-k" => "Nidan verb (upper class) with `ku' ending (archaic)",
129
+ "v2k-s" => "Nidan verb (lower class) with `ku' ending (archaic)",
130
+ "v2m-k" => "Nidan verb (upper class) with `mu' ending (archaic)",
131
+ "v2m-s" => "Nidan verb (lower class) with `mu' ending (archaic)",
132
+ "v2n-s" => "Nidan verb (lower class) with `nu' ending (archaic)",
133
+ "v2r-k" => "Nidan verb (upper class) with `ru' ending (archaic)",
134
+ "v2r-s" => "Nidan verb (lower class) with `ru' ending (archaic)",
135
+ "v2s-s" => "Nidan verb (lower class) with `su' ending (archaic)",
136
+ "v2t-k" => "Nidan verb (upper class) with `tsu' ending (archaic)",
137
+ "v2t-s" => "Nidan verb (lower class) with `tsu' ending (archaic)",
138
+ "v2w-s" => "Nidan verb (lower class) with `u' ending and `we' conjugation (archaic)",
139
+ "v2y-k" => "Nidan verb (upper class) with `yu' ending (archaic)",
140
+ "v2y-s" => "Nidan verb (lower class) with `yu' ending (archaic)",
141
+ "v2z-s" => "Nidan verb (lower class) with `zu' ending (archaic)",
142
+ "v4b" => "Yodan verb with `bu' ending (archaic)",
143
+ "v4g" => "Yodan verb with `gu' ending (archaic)",
144
+ "v4h" => "Yodan verb with `hu/fu' ending (archaic)",
145
+ "v4k" => "Yodan verb with `ku' ending (archaic)",
146
+ "v4m" => "Yodan verb with `mu' ending (archaic)",
147
+ "v4n" => "Yodan verb with `nu' ending (archaic)",
148
+ "v4r" => "Yodan verb with `ru' ending (archaic)",
149
+ "v4s" => "Yodan verb with `su' ending (archaic)",
150
+ "v4t" => "Yodan verb with `tsu' ending (archaic)",
151
+ "v5aru" => "Godan verb - -aru special class",
152
+ "v5b" => "Godan verb with `bu' ending",
153
+ "v5g" => "Godan verb with `gu' ending",
154
+ "v5k" => "Godan verb with `ku' ending",
155
+ "v5k-s" => "Godan verb - Iku/Yuku special class",
156
+ "v5m" => "Godan verb with `mu' ending",
157
+ "v5n" => "Godan verb with `nu' ending",
158
+ "v5r" => "Godan verb with `ru' ending",
159
+ "v5r-i" => "Godan verb with `ru' ending (irregular verb)",
160
+ "v5s" => "Godan verb with `su' ending",
161
+ "v5t" => "Godan verb with `tsu' ending",
162
+ "v5u" => "Godan verb with `u' ending",
163
+ "v5u-s" => "Godan verb with `u' ending (special class)",
164
+ "v5uru" => "Godan verb - Uru old class verb (old form of Eru)",
165
+ "vi" => "intransitive verb",
166
+ "vk" => "Kuru verb - special class",
167
+ "vn" => "irregular nu verb",
168
+ "vr" => "irregular ru verb, plain form ends with -ri",
169
+ "vs" => "noun or participle which takes the aux. verb suru",
170
+ "vs-c" => "su verb - precursor to the modern suru",
171
+ "vs-i" => "suru verb - included",
172
+ "vs-s" => "suru verb - special class",
173
+ "vt" => "transitive verb",
174
+ "vulg" => "vulgar expression or word",
175
+ "vz" => "Ichidan verb - zuru verb (alternative form of -jiru verbs)",
176
+ "yoji" => "yojijukugo",
177
+ "zool" => "zoology term"
178
+ }
179
+ end
180
+ end
@@ -0,0 +1,43 @@
1
+ module Eiwa
2
+ module Kanjidic
3
+ TAGS = {
4
+ "character" => Tag::Character,
5
+ "misc" => Tag::Bag,
6
+ "reading_meaning" => Tag::ReadingMeaning,
7
+ "rmgroup" => Tag::List
8
+ }
9
+
10
+ class Doc < Nokogiri::XML::SAX::Document
11
+ def initialize(each_entry_block)
12
+ @each_entry_block = each_entry_block
13
+ @current = nil
14
+ end
15
+
16
+ def start_element(name, attrs)
17
+ parent = @current
18
+ @current = (TAGS[name] || Tag::Other).new
19
+ @current.start(name, attrs, parent)
20
+ end
21
+
22
+ def end_element(name)
23
+ raise Eiwa::Error.new("Parsing error. Expected <#{@current.tag_name}> to close before <#{name}>") if @current.tag_name != name
24
+ ending = @current
25
+ ending.end_self
26
+ if ending.is_a?(Tag::Character)
27
+ @each_entry_block&.call(ending)
28
+ end
29
+
30
+ @current = ending.parent
31
+ @current&.end_child(ending)
32
+ end
33
+
34
+ def characters(s)
35
+ @current.add_characters(s)
36
+ end
37
+
38
+ def error(msg)
39
+ raise Eiwa::Error.new("Parsing error: #{msg}")
40
+ end
41
+ end
42
+ end
43
+ end
@@ -0,0 +1,35 @@
1
+ require "nokogiri"
2
+ require_relative "jmdict/doc"
3
+ require_relative "kanjidic/doc"
4
+
5
+ module Eiwa
6
+ class ParsesFile
7
+ def call(filename, type, each_entry_block)
8
+ if each_entry_block.nil?
9
+ entries = []
10
+ each_entry_block ||= ->(e) { entries << e }
11
+ end
12
+
13
+ doc_for(type).new(each_entry_block).tap do |doc|
14
+ Nokogiri::XML::SAX::Parser.new(doc).parse_file(filename) do |ctx|
15
+ ctx.recovery = true
16
+ end
17
+ end
18
+
19
+ entries
20
+ end
21
+
22
+ private
23
+
24
+ def doc_for(type)
25
+ case type
26
+ when :jmdict_e
27
+ Jmdict::Doc
28
+ when :kanjidic2
29
+ Kanjidic::Doc
30
+ else
31
+ raise Eiwa::Error.new("Unknown file type: #{type}")
32
+ end
33
+ end
34
+ end
35
+ end
@@ -18,10 +18,10 @@ module Eiwa
18
18
  @text == other.text &&
19
19
  @sense_ordinal == other.sense_ordinal
20
20
  end
21
- alias == eql?
21
+ alias_method :==, :eql?
22
22
 
23
23
  def hash
24
- @text.hash + @sense_ordinal.hash
24
+ [@text, @sense_ordinal].hash
25
25
  end
26
26
  end
27
27
  end
data/lib/eiwa/tag/any.rb CHANGED
@@ -5,7 +5,7 @@ module Eiwa
5
5
 
6
6
  def start(tag_name, attrs, parent)
7
7
  @tag_name = tag_name
8
- @attrs = Hash[attrs]
8
+ @attrs = attrs.to_h
9
9
  @parent = parent
10
10
  end
11
11
 
@@ -0,0 +1,21 @@
1
+ module Eiwa
2
+ module Tag
3
+ # For simple elements that contain child element_name, value pairs that could plop into a hash nicely
4
+ class Bag < Any
5
+ attr_reader :values
6
+
7
+ def initialize
8
+ @values = {}
9
+ end
10
+
11
+ def [](key)
12
+ @values[key]
13
+ end
14
+
15
+ def end_child(child)
16
+ # Don't overwrite, first dupe tends to be the authoritative one
17
+ @values[child.tag_name] = child.text unless @values.key?(child.tag_name)
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,24 @@
1
+ module Eiwa
2
+ module Tag
3
+ class Character < Any
4
+ attr_reader :text,
5
+ :grade, :stroke_count, :freq, :jlpt,
6
+ :onyomi, :kunyomi, :meanings
7
+
8
+ def end_child(child)
9
+ if child.tag_name == "literal"
10
+ @text = child.text
11
+ elsif child.tag_name == "reading_meaning"
12
+ @onyomi = child.rmgroup.items.select { |item| item.name == "reading" && item.attrs["r_type"] == "ja_on" }.map(&:text)
13
+ @kunyomi = child.rmgroup.items.select { |item| item.name == "reading" && item.attrs["r_type"] == "ja_kun" }.map(&:text)
14
+ @meanings = child.rmgroup.items.select { |item| item.name == "meaning" && (item.attrs["m_lang"].nil? || item.attrs["m_lang"] == "en") }.map(&:text)
15
+ elsif child.tag_name == "misc"
16
+ @grade = child["grade"]&.to_i
17
+ @stroke_count = child["stroke_count"]&.to_i
18
+ @freq = child["freq"]&.to_i
19
+ @jlpt = child["jlpt"]&.to_i
20
+ end
21
+ end
22
+ end
23
+ end
24
+ end