eiwa 0.0.2 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c5888e4802408cc8efdb55ddadfb560ec38d10971fee95b20dd53af2f31f487c
4
- data.tar.gz: 93b7101b430ee123a905065f87e5d8ba336e1a18f145c7801daeb2b9b9a5ba72
3
+ metadata.gz: 87f19acddb018cdf9b99c46b8cd03b38c7ae13d2d006fe31938cf76ebba9da87
4
+ data.tar.gz: e1049fd3df59e89d3b45998a8bf7fbd0620f29a4ba5908a28df33bf72cbd1bc2
5
5
  SHA512:
6
- metadata.gz: 64faccd9958b9c359fcd7a7ff40de013bd1060bbf9a59735be4d52c824dd3e6e77abbb81ab42ae037b9175990dd15baa33722503b47d8aaebbb037a8a303f965
7
- data.tar.gz: 209efe931acfa8563ea1819f4e9b5a7d07a996d1545458b139de14f82971b27f47e0da726ed5151b3922aa05f6c5809bd3196269e0f6d254cb87e801c81ffe6e
6
+ metadata.gz: 5900ba9dd6094ca0b376f1a7067df65ddd289b6ef74c9b111ed44a28ac07daf7c32c6b92d898b85fc93278718e1e10fb674a90ae51df3b3303a80d4bf6365de2
7
+ data.tar.gz: ced4f6719aab797bbb5b1ba251f285751ae15982f36f5c296ebf3e6b607feff0350109a284cc8aee768c2cd248d7aa1f81227567ff549961bc4e37ec4193131b
@@ -9,12 +9,9 @@ jobs:
9
9
 
10
10
  steps:
11
11
  - uses: actions/checkout@v1
12
- - name: Set up Ruby 2.6
13
- uses: actions/setup-ruby@v1
12
+ - uses: ruby/setup-ruby@v1
14
13
  with:
15
- ruby-version: 2.6.x
16
- - name: Build and test with Rake
17
- run: |
18
- gem install bundler
19
- bundle install --jobs 4 --retry 3
20
- bundle exec rake
14
+ ruby-version: '3.3'
15
+ bundler-cache: true
16
+ - name: Run tests
17
+ run: bundle exec rake
data/.standard.yml ADDED
@@ -0,0 +1 @@
1
+ ruby_version: 3.0
data/Gemfile CHANGED
@@ -2,5 +2,9 @@ source "https://rubygems.org"
2
2
 
3
3
  git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
4
4
 
5
- # Specify your gem's dependencies in eiwa.gemspec
6
5
  gemspec
6
+
7
+ gem "standard"
8
+ gem "minitest"
9
+ gem "rake"
10
+ gem "m"
data/Gemfile.lock CHANGED
@@ -1,53 +1,79 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- eiwa (0.0.2)
5
- nokogiri
4
+ eiwa (0.1.1)
5
+ nokogiri (~> 1.15.5)
6
6
 
7
7
  GEM
8
8
  remote: https://rubygems.org/
9
9
  specs:
10
- ast (2.4.0)
11
- coderay (1.1.2)
12
- jaro_winkler (1.5.3)
13
- method_source (0.9.2)
14
- mini_portile2 (2.4.0)
15
- minitest (5.11.3)
16
- nokogiri (1.10.9)
17
- mini_portile2 (~> 2.4.0)
18
- parallel (1.17.0)
19
- parser (2.6.4.1)
20
- ast (~> 2.4.0)
21
- pry (0.12.2)
22
- coderay (~> 1.1.0)
23
- method_source (~> 0.9.0)
24
- rainbow (3.0.0)
25
- rake (13.0.1)
26
- rubocop (0.72.0)
27
- jaro_winkler (~> 1.5.1)
10
+ ast (2.4.2)
11
+ json (2.7.1)
12
+ language_server-protocol (3.17.0.3)
13
+ lint_roller (1.1.0)
14
+ m (1.6.2)
15
+ method_source (>= 0.6.7)
16
+ rake (>= 0.9.2.2)
17
+ method_source (1.0.0)
18
+ mini_portile2 (2.8.5)
19
+ minitest (5.22.2)
20
+ nokogiri (1.15.5)
21
+ mini_portile2 (~> 2.8.2)
22
+ racc (~> 1.4)
23
+ parallel (1.24.0)
24
+ parser (3.3.0.5)
25
+ ast (~> 2.4.1)
26
+ racc
27
+ racc (1.7.3)
28
+ rainbow (3.1.1)
29
+ rake (13.1.0)
30
+ regexp_parser (2.9.0)
31
+ rexml (3.2.6)
32
+ rubocop (1.62.1)
33
+ json (~> 2.3)
34
+ language_server-protocol (>= 3.17.0)
28
35
  parallel (~> 1.10)
29
- parser (>= 2.6)
36
+ parser (>= 3.3.0.2)
30
37
  rainbow (>= 2.2.2, < 4.0)
38
+ regexp_parser (>= 1.8, < 3.0)
39
+ rexml (>= 3.2.5, < 4.0)
40
+ rubocop-ast (>= 1.31.1, < 2.0)
31
41
  ruby-progressbar (~> 1.7)
32
- unicode-display_width (>= 1.4.0, < 1.7)
33
- rubocop-performance (1.4.1)
34
- rubocop (>= 0.71.0)
35
- ruby-progressbar (1.10.1)
36
- standard (0.1.4)
37
- rubocop (~> 0.72.0)
38
- rubocop-performance (~> 1.4.0)
39
- unicode-display_width (1.6.0)
42
+ unicode-display_width (>= 2.4.0, < 3.0)
43
+ rubocop-ast (1.31.2)
44
+ parser (>= 3.3.0.4)
45
+ rubocop-performance (1.20.2)
46
+ rubocop (>= 1.48.1, < 2.0)
47
+ rubocop-ast (>= 1.30.0, < 2.0)
48
+ ruby-progressbar (1.13.0)
49
+ standard (1.34.0)
50
+ language_server-protocol (~> 3.17.0.2)
51
+ lint_roller (~> 1.0)
52
+ rubocop (~> 1.60)
53
+ standard-custom (~> 1.0.0)
54
+ standard-performance (~> 1.3)
55
+ standard-custom (1.0.2)
56
+ lint_roller (~> 1.0)
57
+ rubocop (~> 1.50)
58
+ standard-performance (1.3.1)
59
+ lint_roller (~> 1.1)
60
+ rubocop-performance (~> 1.20.2)
61
+ unicode-display_width (2.5.0)
40
62
 
41
63
  PLATFORMS
42
- ruby
64
+ aarch64-linux
65
+ arm-linux
66
+ arm64-darwin
67
+ x86-linux
68
+ x86_64-darwin
69
+ x86_64-linux
43
70
 
44
71
  DEPENDENCIES
45
- bundler (~> 1.17)
46
72
  eiwa!
47
- minitest (~> 5.0)
48
- pry
49
- rake (~> 13.0)
73
+ m
74
+ minitest
75
+ rake
50
76
  standard
51
77
 
52
78
  BUNDLED WITH
53
- 1.17.3
79
+ 2.5.4
data/README.md CHANGED
@@ -1,7 +1,12 @@
1
1
  # eiwa / 英和
2
2
 
3
- Parses the Japanese-English version of JMDict, a daily export of the WWWJDIC
4
- online Japanese dictionary.
3
+ Parses two types of Japanese-English dictionaries:
4
+
5
+ * `:jmdict_e` - [JMDict](http://www.edrdg.org/jmdict/edict_doc.html)'s
6
+ English-only export of the WWWJDIC online Japanese dictionary.
7
+ * `:kanjidic2` - the
8
+ [KANJIDIC2](http://www.edrdg.org/wiki/index.php/KANJIDIC_Project) dictionary
9
+ of roughly 13,000 kanji characters.
5
10
 
6
11
  ## Usage
7
12
 
@@ -23,15 +28,24 @@ gem 'eiwa'
23
28
 
24
29
  Get your hands on a supported dictionary. Right now eiwa only parses
25
30
  [JMDict](http://www.edrdg.org/jmdict/j_jmdict.html), which can be fetched from
26
- the [Monash ftp site](http://ftp.monash.edu/pub/nihongo/00INDEX.html) or with a
31
+ the [EDRDG ftp site](http://ftp.edrdg.org/pub/Nihongo/00INDEX.html) or with a
27
32
  script like this, for the Japanese-English export:
28
33
 
29
34
  ```bash
30
- curl http://ftp.monash.edu/pub/nihongo/JMdict_e -o jmdict.xml
35
+ # Download JMDICT-E:
36
+ $ curl http://ftp.edrdg.org/pub/Nihongo/JMdict_e.gz -o jmdict.xml.gz
37
+ # Unzip to jmdict.xml
38
+ $ gunzip jmdict.xml.gz
39
+
40
+ # Download KANJIDIC2:
41
+ $ curl http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -o kanjidic2.xml.gz
42
+ # Unzip to kanjidic2.xml
43
+ $ gunzip kanjidic2.xml.gz
31
44
  ```
32
45
 
33
- This file is updated daily, and is essentially an export of all vocabulary on
34
- the [WWWJDIC application](http://nihongo.monash.edu/cgi-bin/wwwjdic?1C)
46
+ These files are updated daily, and are essentially an export of all vocabulary
47
+ and kanji in the [WWWJDIC
48
+ application](http://nihongo.monash.edu/cgi-bin/wwwjdic?1C)
35
49
 
36
50
  ### Parse the dictionary
37
51
 
@@ -44,13 +58,11 @@ array and one that will invoke a provided block with each entry, but which won't
44
58
  retain a reference to the entries, allowing Ruby to garbage collect them as it
45
59
  goes.
46
60
 
47
- Parsing the dictionary is CPU intensive, and takes about 13 seconds on my 2019
48
- 13" MacBook Pro.
49
-
50
61
  #### Passing a block
51
62
 
52
63
  If you just want to do some processing on each entry, it probably makes sense to
53
- invoke the library by passing a block
64
+ invoke the library by passing a block (note that supported types include only
65
+ `:jmdict_e` and `:kanjidic2`)
54
66
 
55
67
  ```ruby
56
68
  Eiwa.parse_file("path/to/some.xml", type: :jmdict_e) do |entry|
@@ -74,6 +86,3 @@ entries = Eiwa.parse_file("path/to/some.xml", type: :jmdict_e)
74
86
  Note that for the abridged Japanese-English dictionary, this will consume about
75
87
  500MB of RAM.
76
88
 
77
- ### The entry object model
78
-
79
- I haven't documented the [Entry](https://github.com/searls/eiwa/blob/master/lib/eiwa/tag/entry.rb) type or its child types yet, but they should be pretty easy to piece together by inspecting the output and [checking the source listings](https://github.com/searls/eiwa/blob/master/lib/eiwa/tag).
data/eiwa.gemspec CHANGED
@@ -19,10 +19,5 @@ Gem::Specification.new do |spec|
19
19
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
20
  spec.require_paths = ["lib"]
21
21
 
22
- spec.add_dependency "nokogiri"
23
- spec.add_development_dependency "bundler", "~> 1.17"
24
- spec.add_development_dependency "rake", "~> 13.0"
25
- spec.add_development_dependency "minitest", "~> 5.0"
26
- spec.add_development_dependency "standard"
27
- spec.add_development_dependency "pry"
22
+ spec.add_dependency "nokogiri", "~> 1.15.5"
28
23
  end
@@ -0,0 +1,85 @@
1
+ require_relative "entities"
2
+
3
+ module Eiwa
4
+ module Jmdict
5
+ TAGS = {
6
+ "entry" => Tag::Entry,
7
+ "k_ele" => Tag::Spelling,
8
+ "r_ele" => Tag::Reading,
9
+ "sense" => Tag::Meaning,
10
+ "pos" => Tag::Entity,
11
+ "misc" => Tag::Entity,
12
+ "dial" => Tag::Entity,
13
+ "field" => Tag::Entity,
14
+ "ke_inf" => Tag::Entity,
15
+ "re_inf" => Tag::Entity,
16
+ "xref" => Tag::CrossReference,
17
+ "ant" => Tag::Antonym,
18
+ "lsource" => Tag::SourceLanguage,
19
+ "gloss" => Tag::Definition
20
+ }
21
+
22
+ class Doc < Nokogiri::XML::SAX::Document
23
+ def initialize(each_entry_block)
24
+ @each_entry_block = each_entry_block
25
+ @current = nil
26
+ end
27
+
28
+ def start_document
29
+ end
30
+
31
+ def end_document
32
+ end
33
+
34
+ def start_element(name, attrs)
35
+ parent = @current
36
+ @current = (TAGS[name] || Tag::Other).new
37
+ @current.start(name, attrs, parent)
38
+ end
39
+
40
+ def end_element(name)
41
+ raise Eiwa::Error.new("Parsing error. Expected <#{@current.tag_name}> to close before <#{name}>") if @current.tag_name != name
42
+ ending = @current
43
+ ending.end_self
44
+ if ending.is_a?(Tag::Entry)
45
+ @each_entry_block&.call(ending)
46
+ end
47
+
48
+ @current = ending.parent
49
+ @current&.end_child(ending)
50
+ end
51
+
52
+ def characters(s)
53
+ @current.add_characters(s)
54
+ end
55
+
56
+ # def comment string
57
+ # puts "comment #{string}"
58
+ # end
59
+
60
+ # def warning string
61
+ # puts "warning #{string}"
62
+ # end
63
+
64
+ def error(msg)
65
+ if (matches = msg.match(/Entity '(\S+)' not defined/))
66
+ # See: http://github.com/sparklemotion/nokogiri/issues/1926
67
+ code = matches[1]
68
+ @current.set_entity(code, ENTITIES[code])
69
+ elsif msg == "Detected an entity reference loop\n"
70
+ # Do nothing and hope this does not matter.
71
+ else
72
+ raise Eiwa::Error.new("Parsing error: #{msg}")
73
+ end
74
+ end
75
+
76
+ # def cdata_block string
77
+ # puts "cdata_block #{string}"
78
+ # end
79
+
80
+ # def processing_instruction name, content
81
+ # puts "processing_instruction #{name}, #{content}"
82
+ # end
83
+ end
84
+ end
85
+ end
@@ -0,0 +1,180 @@
1
+ module Eiwa
2
+ module Jmdict
3
+ ENTITIES = {
4
+ "Buddh" => "Buddhist term",
5
+ "MA" => "martial arts term",
6
+ "Shinto" => "Shinto term",
7
+ "X" => "rude or X-rated term (not displayed in educational software)",
8
+ "abbr" => "abbreviation",
9
+ "adj-f" => "noun or verb acting prenominally",
10
+ "adj-i" => "adjective (keiyoushi)",
11
+ "adj-ix" => "adjective (keiyoushi) - yoi/ii class",
12
+ "adj-kari" => "`kari' adjective (archaic)",
13
+ "adj-ku" => "`ku' adjective (archaic)",
14
+ "adj-na" => "adjectival nouns or quasi-adjectives (keiyodoshi)",
15
+ "adj-nari" => "archaic/formal form of na-adjective",
16
+ "adj-no" => "nouns which may take the genitive case particle `no'",
17
+ "adj-pn" => "pre-noun adjectival (rentaishi)",
18
+ "adj-shiku" => "`shiku' adjective (archaic)",
19
+ "adj-t" => "`taru' adjective",
20
+ "adv" => "adverb (fukushi)",
21
+ "adv-to" => "adverb taking the `to' particle",
22
+ "anat" => "anatomical term",
23
+ "arch" => "archaism",
24
+ "archit" => "architecture term",
25
+ "astron" => "astronomy, etc. term",
26
+ "ateji" => "ateji (phonetic) reading",
27
+ "aux" => "auxiliary",
28
+ "aux-adj" => "auxiliary adjective",
29
+ "aux-v" => "auxiliary verb",
30
+ "baseb" => "baseball term",
31
+ "biol" => "biology term",
32
+ "bot" => "botany term",
33
+ "bus" => "business term",
34
+ "chem" => "chemistry term",
35
+ "chn" => "children's language",
36
+ "col" => "colloquialism",
37
+ "comp" => "computer terminology",
38
+ "conj" => "conjunction",
39
+ "cop" => "copula",
40
+ "cop-da" => "copula",
41
+ "ctr" => "counter",
42
+ "derog" => "derogatory",
43
+ "eK" => "exclusively kanji",
44
+ "econ" => "economics term",
45
+ "ek" => "exclusively kana",
46
+ "engr" => "engineering term",
47
+ "exp" => "expressions (phrases, clauses, etc.)",
48
+ "fam" => "familiar language",
49
+ "fem" => "female term or language",
50
+ "finc" => "finance term",
51
+ "food" => "food term",
52
+ "geol" => "geology, etc. term",
53
+ "geom" => "geometry term",
54
+ "gikun" => "gikun (meaning as reading) or jukujikun (special kanji reading)",
55
+ "hob" => "Hokkaido-ben",
56
+ "hon" => "honorific or respectful (sonkeigo) language",
57
+ "hum" => "humble (kenjougo) language",
58
+ "iK" => "word containing irregular kanji usage",
59
+ "id" => "idiomatic expression",
60
+ "ik" => "word containing irregular kana usage",
61
+ "int" => "interjection (kandoushi)",
62
+ "io" => "irregular okurigana usage",
63
+ "iv" => "irregular verb",
64
+ "joc" => "jocular, humorous term",
65
+ "ksb" => "Kansai-ben",
66
+ "ktb" => "Kantou-ben",
67
+ "kyb" => "Kyoto-ben",
68
+ "kyu" => "Kyuushuu-ben",
69
+ "law" => "law, etc. term",
70
+ "ling" => "linguistics terminology",
71
+ "m-sl" => "manga slang",
72
+ "mahj" => "mahjong term",
73
+ "male" => "male term or language",
74
+ "male-sl" => "male slang",
75
+ "math" => "mathematics",
76
+ "med" => "medicine, etc. term",
77
+ "mil" => "military",
78
+ "music" => "music term",
79
+ "n" => "noun (common) (futsuumeishi)",
80
+ "n-adv" => "adverbial noun (fukushitekimeishi)",
81
+ "n-pr" => "proper noun",
82
+ "n-pref" => "noun, used as a prefix",
83
+ "n-suf" => "noun, used as a suffix",
84
+ "n-t" => "noun (temporal) (jisoumeishi)",
85
+ "nab" => "Nagano-ben",
86
+ "num" => "numeric",
87
+ "oK" => "word containing out-dated kanji",
88
+ "obs" => "obsolete term",
89
+ "obsc" => "obscure term",
90
+ "oik" => "old or irregular kana form",
91
+ "ok" => "out-dated or obsolete kana usage",
92
+ "on-mim" => "onomatopoeic or mimetic word",
93
+ "osb" => "Osaka-ben",
94
+ "physics" => "physics terminology",
95
+ "pn" => "pronoun",
96
+ "poet" => "poetical term",
97
+ "pol" => "polite (teineigo) language",
98
+ "pref" => "prefix",
99
+ "proverb" => "proverb",
100
+ "prt" => "particle",
101
+ "quote" => "quotation",
102
+ "rare" => "rare",
103
+ "rkb" => "Ryuukyuu-ben",
104
+ "sens" => "sensitive",
105
+ "shogi" => "shogi term",
106
+ "sl" => "slang",
107
+ "sports" => "sports term",
108
+ "suf" => "suffix",
109
+ "sumo" => "sumo term",
110
+ "thb" => "Touhoku-ben",
111
+ "tsb" => "Tosa-ben",
112
+ "tsug" => "Tsugaru-ben",
113
+ "uK" => "word usually written using kanji alone",
114
+ "uk" => "word usually written using kana alone",
115
+ "unc" => "unclassified",
116
+ "v-unspec" => "verb unspecified",
117
+ "v1" => "Ichidan verb",
118
+ "v1-s" => "Ichidan verb - kureru special class",
119
+ "v2a-s" => "Nidan verb with 'u' ending (archaic)",
120
+ "v2b-k" => "Nidan verb (upper class) with `bu' ending (archaic)",
121
+ "v2b-s" => "Nidan verb (lower class) with `bu' ending (archaic)",
122
+ "v2d-k" => "Nidan verb (upper class) with `dzu' ending (archaic)",
123
+ "v2d-s" => "Nidan verb (lower class) with `dzu' ending (archaic)",
124
+ "v2g-k" => "Nidan verb (upper class) with `gu' ending (archaic)",
125
+ "v2g-s" => "Nidan verb (lower class) with `gu' ending (archaic)",
126
+ "v2h-k" => "Nidan verb (upper class) with `hu/fu' ending (archaic)",
127
+ "v2h-s" => "Nidan verb (lower class) with `hu/fu' ending (archaic)",
128
+ "v2k-k" => "Nidan verb (upper class) with `ku' ending (archaic)",
129
+ "v2k-s" => "Nidan verb (lower class) with `ku' ending (archaic)",
130
+ "v2m-k" => "Nidan verb (upper class) with `mu' ending (archaic)",
131
+ "v2m-s" => "Nidan verb (lower class) with `mu' ending (archaic)",
132
+ "v2n-s" => "Nidan verb (lower class) with `nu' ending (archaic)",
133
+ "v2r-k" => "Nidan verb (upper class) with `ru' ending (archaic)",
134
+ "v2r-s" => "Nidan verb (lower class) with `ru' ending (archaic)",
135
+ "v2s-s" => "Nidan verb (lower class) with `su' ending (archaic)",
136
+ "v2t-k" => "Nidan verb (upper class) with `tsu' ending (archaic)",
137
+ "v2t-s" => "Nidan verb (lower class) with `tsu' ending (archaic)",
138
+ "v2w-s" => "Nidan verb (lower class) with `u' ending and `we' conjugation (archaic)",
139
+ "v2y-k" => "Nidan verb (upper class) with `yu' ending (archaic)",
140
+ "v2y-s" => "Nidan verb (lower class) with `yu' ending (archaic)",
141
+ "v2z-s" => "Nidan verb (lower class) with `zu' ending (archaic)",
142
+ "v4b" => "Yodan verb with `bu' ending (archaic)",
143
+ "v4g" => "Yodan verb with `gu' ending (archaic)",
144
+ "v4h" => "Yodan verb with `hu/fu' ending (archaic)",
145
+ "v4k" => "Yodan verb with `ku' ending (archaic)",
146
+ "v4m" => "Yodan verb with `mu' ending (archaic)",
147
+ "v4n" => "Yodan verb with `nu' ending (archaic)",
148
+ "v4r" => "Yodan verb with `ru' ending (archaic)",
149
+ "v4s" => "Yodan verb with `su' ending (archaic)",
150
+ "v4t" => "Yodan verb with `tsu' ending (archaic)",
151
+ "v5aru" => "Godan verb - -aru special class",
152
+ "v5b" => "Godan verb with `bu' ending",
153
+ "v5g" => "Godan verb with `gu' ending",
154
+ "v5k" => "Godan verb with `ku' ending",
155
+ "v5k-s" => "Godan verb - Iku/Yuku special class",
156
+ "v5m" => "Godan verb with `mu' ending",
157
+ "v5n" => "Godan verb with `nu' ending",
158
+ "v5r" => "Godan verb with `ru' ending",
159
+ "v5r-i" => "Godan verb with `ru' ending (irregular verb)",
160
+ "v5s" => "Godan verb with `su' ending",
161
+ "v5t" => "Godan verb with `tsu' ending",
162
+ "v5u" => "Godan verb with `u' ending",
163
+ "v5u-s" => "Godan verb with `u' ending (special class)",
164
+ "v5uru" => "Godan verb - Uru old class verb (old form of Eru)",
165
+ "vi" => "intransitive verb",
166
+ "vk" => "Kuru verb - special class",
167
+ "vn" => "irregular nu verb",
168
+ "vr" => "irregular ru verb, plain form ends with -ri",
169
+ "vs" => "noun or participle which takes the aux. verb suru",
170
+ "vs-c" => "su verb - precursor to the modern suru",
171
+ "vs-i" => "suru verb - included",
172
+ "vs-s" => "suru verb - special class",
173
+ "vt" => "transitive verb",
174
+ "vulg" => "vulgar expression or word",
175
+ "vz" => "Ichidan verb - zuru verb (alternative form of -jiru verbs)",
176
+ "yoji" => "yojijukugo",
177
+ "zool" => "zoology term"
178
+ }
179
+ end
180
+ end
@@ -0,0 +1,43 @@
1
+ module Eiwa
2
+ module Kanjidic
3
+ TAGS = {
4
+ "character" => Tag::Character,
5
+ "misc" => Tag::Bag,
6
+ "reading_meaning" => Tag::ReadingMeaning,
7
+ "rmgroup" => Tag::List
8
+ }
9
+
10
+ class Doc < Nokogiri::XML::SAX::Document
11
+ def initialize(each_entry_block)
12
+ @each_entry_block = each_entry_block
13
+ @current = nil
14
+ end
15
+
16
+ def start_element(name, attrs)
17
+ parent = @current
18
+ @current = (TAGS[name] || Tag::Other).new
19
+ @current.start(name, attrs, parent)
20
+ end
21
+
22
+ def end_element(name)
23
+ raise Eiwa::Error.new("Parsing error. Expected <#{@current.tag_name}> to close before <#{name}>") if @current.tag_name != name
24
+ ending = @current
25
+ ending.end_self
26
+ if ending.is_a?(Tag::Character)
27
+ @each_entry_block&.call(ending)
28
+ end
29
+
30
+ @current = ending.parent
31
+ @current&.end_child(ending)
32
+ end
33
+
34
+ def characters(s)
35
+ @current.add_characters(s)
36
+ end
37
+
38
+ def error(msg)
39
+ raise Eiwa::Error.new("Parsing error: #{msg}")
40
+ end
41
+ end
42
+ end
43
+ end
@@ -0,0 +1,35 @@
1
+ require "nokogiri"
2
+ require_relative "jmdict/doc"
3
+ require_relative "kanjidic/doc"
4
+
5
+ module Eiwa
6
+ class ParsesFile
7
+ def call(filename, type, each_entry_block)
8
+ if each_entry_block.nil?
9
+ entries = []
10
+ each_entry_block ||= ->(e) { entries << e }
11
+ end
12
+
13
+ doc_for(type).new(each_entry_block).tap do |doc|
14
+ Nokogiri::XML::SAX::Parser.new(doc).parse_file(filename) do |ctx|
15
+ ctx.recovery = true
16
+ end
17
+ end
18
+
19
+ entries
20
+ end
21
+
22
+ private
23
+
24
+ def doc_for(type)
25
+ case type
26
+ when :jmdict_e
27
+ Jmdict::Doc
28
+ when :kanjidic2
29
+ Kanjidic::Doc
30
+ else
31
+ raise Eiwa::Error.new("Unknown file type: #{type}")
32
+ end
33
+ end
34
+ end
35
+ end
@@ -18,10 +18,10 @@ module Eiwa
18
18
  @text == other.text &&
19
19
  @sense_ordinal == other.sense_ordinal
20
20
  end
21
- alias == eql?
21
+ alias_method :==, :eql?
22
22
 
23
23
  def hash
24
- @text.hash + @sense_ordinal.hash
24
+ [@text, @sense_ordinal].hash
25
25
  end
26
26
  end
27
27
  end
data/lib/eiwa/tag/any.rb CHANGED
@@ -5,7 +5,7 @@ module Eiwa
5
5
 
6
6
  def start(tag_name, attrs, parent)
7
7
  @tag_name = tag_name
8
- @attrs = Hash[attrs]
8
+ @attrs = attrs.to_h
9
9
  @parent = parent
10
10
  end
11
11
 
@@ -0,0 +1,21 @@
1
+ module Eiwa
2
+ module Tag
3
+ # For simple elements that contain child element_name, value pairs that could plop into a hash nicely
4
+ class Bag < Any
5
+ attr_reader :values
6
+
7
+ def initialize
8
+ @values = {}
9
+ end
10
+
11
+ def [](key)
12
+ @values[key]
13
+ end
14
+
15
+ def end_child(child)
16
+ # Don't overwrite, first dupe tends to be the authoritative one
17
+ @values[child.tag_name] = child.text unless @values.key?(child.tag_name)
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,24 @@
1
+ module Eiwa
2
+ module Tag
3
+ class Character < Any
4
+ attr_reader :text,
5
+ :grade, :stroke_count, :freq, :jlpt,
6
+ :onyomi, :kunyomi, :meanings
7
+
8
+ def end_child(child)
9
+ if child.tag_name == "literal"
10
+ @text = child.text
11
+ elsif child.tag_name == "reading_meaning"
12
+ @onyomi = child.rmgroup.items.select { |item| item.name == "reading" && item.attrs["r_type"] == "ja_on" }.map(&:text)
13
+ @kunyomi = child.rmgroup.items.select { |item| item.name == "reading" && item.attrs["r_type"] == "ja_kun" }.map(&:text)
14
+ @meanings = child.rmgroup.items.select { |item| item.name == "meaning" && (item.attrs["m_lang"].nil? || item.attrs["m_lang"] == "en") }.map(&:text)
15
+ elsif child.tag_name == "misc"
16
+ @grade = child["grade"]&.to_i
17
+ @stroke_count = child["stroke_count"]&.to_i
18
+ @freq = child["freq"]&.to_i
19
+ @jlpt = child["jlpt"]&.to_i
20
+ end
21
+ end
22
+ end
23
+ end
24
+ end