RubyGems - eiwa - Versions diffs - 0.0.2 → 0.1.0 - Mend

eiwa 0.0.2 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

checksums.yaml +4 -4
data/Gemfile.lock +35 -27
data/README.md +22 -13
data/lib/eiwa.rb +19 -7
data/lib/eiwa/jmdict/doc.rb +85 -0
data/lib/eiwa/jmdict/entities.rb +180 -0
data/lib/eiwa/kanjidic/doc.rb +43 -0
data/lib/eiwa/parses_file.rb +35 -0
data/lib/eiwa/tag/antonym.rb +1 -1
data/lib/eiwa/tag/bag.rb +21 -0
data/lib/eiwa/tag/character.rb +24 -0
data/lib/eiwa/tag/cross_reference.rb +1 -1
data/lib/eiwa/tag/definition.rb +1 -1
data/lib/eiwa/tag/entity.rb +1 -3
data/lib/eiwa/tag/entry.rb +0 -2
data/lib/eiwa/tag/list.rb +18 -0
data/lib/eiwa/tag/meaning.rb +0 -2
data/lib/eiwa/tag/other.rb +5 -3
data/lib/eiwa/tag/reading.rb +0 -2
data/lib/eiwa/tag/reading_meaning.rb +11 -0
data/lib/eiwa/tag/source_language.rb +1 -1
data/lib/eiwa/tag/spelling.rb +0 -2
data/lib/eiwa/version.rb +1 -1
metadata +15 -10
data/lib/eiwa/jmdict_doc.rb +0 -93
data/lib/eiwa/jmdict_entities.rb +0 -178
data/lib/eiwa/parses_jmdict_file.rb +0 -21

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: c5888e4802408cc8efdb55ddadfb560ec38d10971fee95b20dd53af2f31f487c
-  data.tar.gz: 93b7101b430ee123a905065f87e5d8ba336e1a18f145c7801daeb2b9b9a5ba72
+  metadata.gz: ef277fcf117e28dcbc32fc6e17c8d34f1783b546f9adf5b9452e04d073d31ee8
+  data.tar.gz: d1e744a10c6e688e532da9855790ba1a9fad89338b75f0924ee9a5f90294e3ee
 SHA512:
-  metadata.gz: 64faccd9958b9c359fcd7a7ff40de013bd1060bbf9a59735be4d52c824dd3e6e77abbb81ab42ae037b9175990dd15baa33722503b47d8aaebbb037a8a303f965
-  data.tar.gz: 209efe931acfa8563ea1819f4e9b5a7d07a996d1545458b139de14f82971b27f47e0da726ed5151b3922aa05f6c5809bd3196269e0f6d254cb87e801c81ffe6e
+  metadata.gz: '090773b16ffc636c53cbd957e5f6131449c455bf4e00c39908f78403c2c284bd7364b9e33ea9f6d8ac9a930777cfdc1f67e296d701ee7224fd6572da812f5f93'
+  data.tar.gz: d43f3a9ded86a0a3238eaac28c86b1d3d7b95dbe3d7a7c6eb8fe389014c7393600009f0c79f69cf7b06ff8dcd84650c0fa679524703cd36207b795b994892375

data/Gemfile.lock CHANGED

@@ -1,42 +1,50 @@
 PATH
   remote: .
   specs:
-    eiwa (0.0.2)
+    eiwa (0.1.0)
       nokogiri
 GEM
   remote: https://rubygems.org/
   specs:
-    ast (2.4.0)
-    coderay (1.1.2)
-    jaro_winkler (1.5.3)
-    method_source (0.9.2)
-    mini_portile2 (2.4.0)
-    minitest (5.11.3)
-    nokogiri (1.10.9)
-      mini_portile2 (~> 2.4.0)
-    parallel (1.17.0)
-    parser (2.6.4.1)
-      ast (~> 2.4.0)
-    pry (0.12.2)
-      coderay (~> 1.1.0)
-      method_source (~> 0.9.0)
+    ast (2.4.1)
+    coderay (1.1.3)
+    method_source (1.0.0)
+    mini_portile2 (2.5.0)
+    minitest (5.14.3)
+    nokogiri (1.11.1)
+      mini_portile2 (~> 2.5.0)
+      racc (~> 1.4)
+    parallel (1.20.1)
+    parser (3.0.0.0)
+      ast (~> 2.4.1)
+    pry (0.13.1)
+      coderay (~> 1.1)
+      method_source (~> 1.0)
+    racc (1.5.2)
     rainbow (3.0.0)
-    rake (13.0.1)
-    rubocop (0.72.0)
-      jaro_winkler (~> 1.5.1)
+    rake (13.0.3)
+    regexp_parser (2.0.3)
+    rexml (3.2.4)
+    rubocop (1.7.0)
       parallel (~> 1.10)
-      parser (>= 2.6)
+      parser (>= 2.7.1.5)
       rainbow (>= 2.2.2, < 4.0)
+      regexp_parser (>= 1.8, < 3.0)
+      rexml
+      rubocop-ast (>= 1.2.0, < 2.0)
       ruby-progressbar (~> 1.7)
-      unicode-display_width (>= 1.4.0, < 1.7)
-    rubocop-performance (1.4.1)
-      rubocop (>= 0.71.0)
-    ruby-progressbar (1.10.1)
-    standard (0.1.4)
-      rubocop (~> 0.72.0)
-      rubocop-performance (~> 1.4.0)
-    unicode-display_width (1.6.0)
+      unicode-display_width (>= 1.4.0, < 2.0)
+    rubocop-ast (1.4.0)
+      parser (>= 2.7.1.5)
+    rubocop-performance (1.9.2)
+      rubocop (>= 0.90.0, < 2.0)
+      rubocop-ast (>= 0.4.0)
+    ruby-progressbar (1.11.0)
+    standard (0.11.0)
+      rubocop (= 1.7.0)
+      rubocop-performance (= 1.9.2)
+    unicode-display_width (1.7.0)
 PLATFORMS
   ruby

data/README.md CHANGED

@@ -1,7 +1,12 @@
 # eiwa / 英和
-Parses the Japanese-English version of JMDict, a daily export of the WWWJDIC
-online Japanese dictionary.
+Parses two types of Japanese-English dictionaries:
+* `:jmdict_e` - [JMDict](http://www.edrdg.org/jmdict/edict_doc.html)'s
+  English-only export of the WWWJDIC online Japanese dictionary.
+* `:kanjidic2` - the
+  [KANJIDIC2](http://www.edrdg.org/wiki/index.php/KANJIDIC_Project) dictionary
+  of roughly 13,000 kanji characters
 ## Usage
@@ -23,15 +28,24 @@ gem 'eiwa'
 Get your hands on a supported dictionary. Right now eiwa only parses
 [JMDict](http://www.edrdg.org/jmdict/j_jmdict.html), which can be fetched from
-the [Monash ftp site](http://ftp.monash.edu/pub/nihongo/00INDEX.html) or with a
+the [EDRDG ftp site](http://ftp.edrdg.org/pub/Nihongo/00INDEX.html) or with a
 script like this, for the Japanese-English export:
 ```bash
-curl http://ftp.monash.edu/pub/nihongo/JMdict_e -o jmdict.xml
+# Download JMDICT-E:
+$ curl http://ftp.edrdg.org/pub/Nihongo/JMdict_e.gz -o jmdict.xml.gz
+# Unzip to jmdict.xml
+$ gunzip jmdict.xml.gz
+# Download KANJIDIC2:
+$ curl http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -o kanjidic2.xml.gz
+# Unzip to kanjidic2.xml
+$ gunzip kanjidic2.xml.gz
 ```
-This file is updated daily, and is essentially an export of all vocabulary on
-the [WWWJDIC application](http://nihongo.monash.edu/cgi-bin/wwwjdic?1C)
+These files are updated daily, and are essentially an export of all vocabulary
+and kanji in the [WWWJDIC
+application](http://nihongo.monash.edu/cgi-bin/wwwjdic?1C)
 ### Parse the dictionary
@@ -44,13 +58,11 @@ array and one that will invoke a provided block with each entry, but which won't
 retain a reference to the entries, allowing Ruby to garbage collect them as it
 goes.
-Parsing the dictionary is CPU intensive, and takes about 13 seconds on my 2019
-13" MacBook Pro.
 #### Passing a block
 If you just want to do some processing on each entry, it probably makes sense to
-invoke the library by passing a block
+invoke the library by passing a block (note that supported types include only
+`:jmdict_e` and `:kanjidic2`)
 ```ruby
 Eiwa.parse_file("path/to/some.xml", type: :jmdict_e) do |entry|
@@ -74,6 +86,3 @@ entries = Eiwa.parse_file("path/to/some.xml", type: :jmdict_e)
 Note that for the abridged Japanese-English dictionary, this will consume about
 500MB of RAM.
-### The entry object model
-I haven't documented the [Entry](https://github.com/searls/eiwa/blob/master/lib/eiwa/tag/entry.rb) type or its child types yet, but they should be pretty easy to piece together by inspecting the output and [checking the source listings](https://github.com/searls/eiwa/blob/master/lib/eiwa/tag).

data/lib/eiwa.rb CHANGED

@@ -1,15 +1,27 @@
 require "eiwa/version"
-require "eiwa/parses_jmdict_file"
+require "eiwa/tag/any"
+require "eiwa/tag/character"
+require "eiwa/tag/bag"
+require "eiwa/tag/list"
+require "eiwa/tag/reading_meaning"
+require "eiwa/tag/entry"
+require "eiwa/tag/spelling"
+require "eiwa/tag/reading"
+require "eiwa/tag/meaning"
+require "eiwa/tag/entity"
+require "eiwa/tag/cross_reference"
+require "eiwa/tag/antonym"
+require "eiwa/tag/source_language"
+require "eiwa/tag/definition"
+require "eiwa/tag/other"
+require "eiwa/parses_file"
 module Eiwa
   class Error < StandardError; end
   def self.parse_file(filename, type: :jmdict_e, &each_entry_block)
-    case type
-    when :jmdict_e
-      ParsesJmdictFile.new.call(filename, each_entry_block)
-    else
-      raise Eiwa::Error.new("Unknown file type: #{type}")
-    end
+    ParsesFile.new.call(filename, type, each_entry_block)
   end
 end

data/lib/eiwa/jmdict/doc.rb ADDED

@@ -0,0 +1,85 @@
+require_relative "entities"
+module Eiwa
+  module Jmdict
+    TAGS = {
+      "entry" => Tag::Entry,
+      "k_ele" => Tag::Spelling,
+      "r_ele" => Tag::Reading,
+      "sense" => Tag::Meaning,
+      "pos" => Tag::Entity,
+      "misc" => Tag::Entity,
+      "dial" => Tag::Entity,
+      "field" => Tag::Entity,
+      "ke_inf" => Tag::Entity,
+      "re_inf" => Tag::Entity,
+      "xref" => Tag::CrossReference,
+      "ant" => Tag::Antonym,
+      "lsource" => Tag::SourceLanguage,
+      "gloss" => Tag::Definition
+    }
+    class Doc < Nokogiri::XML::SAX::Document
+      def initialize(each_entry_block)
+        @each_entry_block = each_entry_block
+        @current = nil
+      end
+      def start_document
+      end
+      def end_document
+      end
+      def start_element(name, attrs)
+        parent = @current
+        @current = (TAGS[name] || Tag::Other).new
+        @current.start(name, attrs, parent)
+      end
+      def end_element(name)
+        raise Eiwa::Error.new("Parsing error. Expected <#{@current.tag_name}> to close before <#{name}>") if @current.tag_name != name
+        ending = @current
+        ending.end_self
+        if ending.is_a?(Tag::Entry)
+          @each_entry_block&.call(ending)
+        end
+        @current = ending.parent
+        @current&.end_child(ending)
+      end
+      def characters(s)
+        @current.add_characters(s)
+      end
+      # def comment string
+      #   puts "comment #{string}"
+      # end
+      # def warning string
+      #   puts "warning #{string}"
+      # end
+      def error(msg)
+        if (matches = msg.match(/Entity '(\S+)' not defined/))
+          # See: http://github.com/sparklemotion/nokogiri/issues/1926
+          code = matches[1]
+          @current.set_entity(code, ENTITIES[code])
+        elsif msg == "Detected an entity reference loop\n"
+          # Do nothing and hope this does not matter.
+        else
+          raise Eiwa::Error.new("Parsing error: #{msg}")
+        end
+      end
+      # def cdata_block string
+      #   puts "cdata_block #{string}"
+      # end
+      # def processing_instruction name, content
+      #   puts "processing_instruction #{name}, #{content}"
+      # end
+    end
+  end
+end

data/lib/eiwa/jmdict/entities.rb ADDED

@@ -0,0 +1,180 @@
+module Eiwa
+  module Jmdict
+    ENTITIES = {
+      "Buddh" => "Buddhist term",
+      "MA" => "martial arts term",
+      "Shinto" => "Shinto term",
+      "X" => "rude or X-rated term (not displayed in educational software)",
+      "abbr" => "abbreviation",
+      "adj-f" => "noun or verb acting prenominally",
+      "adj-i" => "adjective (keiyoushi)",
+      "adj-ix" => "adjective (keiyoushi) - yoi/ii class",
+      "adj-kari" => "`kari' adjective (archaic)",
+      "adj-ku" => "`ku' adjective (archaic)",
+      "adj-na" => "adjectival nouns or quasi-adjectives (keiyodoshi)",
+      "adj-nari" => "archaic/formal form of na-adjective",
+      "adj-no" => "nouns which may take the genitive case particle `no'",
+      "adj-pn" => "pre-noun adjectival (rentaishi)",
+      "adj-shiku" => "`shiku' adjective (archaic)",
+      "adj-t" => "`taru' adjective",
+      "adv" => "adverb (fukushi)",
+      "adv-to" => "adverb taking the `to' particle",
+      "anat" => "anatomical term",
+      "arch" => "archaism",
+      "archit" => "architecture term",
+      "astron" => "astronomy, etc. term",
+      "ateji" => "ateji (phonetic) reading",
+      "aux" => "auxiliary",
+      "aux-adj" => "auxiliary adjective",
+      "aux-v" => "auxiliary verb",
+      "baseb" => "baseball term",
+      "biol" => "biology term",
+      "bot" => "botany term",
+      "bus" => "business term",
+      "chem" => "chemistry term",
+      "chn" => "children's language",
+      "col" => "colloquialism",
+      "comp" => "computer terminology",
+      "conj" => "conjunction",
+      "cop" => "copula",
+      "cop-da" => "copula",
+      "ctr" => "counter",
+      "derog" => "derogatory",
+      "eK" => "exclusively kanji",
+      "econ" => "economics term",
+      "ek" => "exclusively kana",
+      "engr" => "engineering term",
+      "exp" => "expressions (phrases, clauses, etc.)",
+      "fam" => "familiar language",
+      "fem" => "female term or language",
+      "finc" => "finance term",
+      "food" => "food term",
+      "geol" => "geology, etc. term",
+      "geom" => "geometry term",
+      "gikun" => "gikun (meaning as reading) or jukujikun (special kanji reading)",
+      "hob" => "Hokkaido-ben",
+      "hon" => "honorific or respectful (sonkeigo) language",
+      "hum" => "humble (kenjougo) language",
+      "iK" => "word containing irregular kanji usage",
+      "id" => "idiomatic expression",
+      "ik" => "word containing irregular kana usage",
+      "int" => "interjection (kandoushi)",
+      "io" => "irregular okurigana usage",
+      "iv" => "irregular verb",
+      "joc" => "jocular, humorous term",
+      "ksb" => "Kansai-ben",
+      "ktb" => "Kantou-ben",
+      "kyb" => "Kyoto-ben",
+      "kyu" => "Kyuushuu-ben",
+      "law" => "law, etc. term",
+      "ling" => "linguistics terminology",
+      "m-sl" => "manga slang",
+      "mahj" => "mahjong term",
+      "male" => "male term or language",
+      "male-sl" => "male slang",
+      "math" => "mathematics",
+      "med" => "medicine, etc. term",
+      "mil" => "military",
+      "music" => "music term",
+      "n" => "noun (common) (futsuumeishi)",
+      "n-adv" => "adverbial noun (fukushitekimeishi)",
+      "n-pr" => "proper noun",
+      "n-pref" => "noun, used as a prefix",
+      "n-suf" => "noun, used as a suffix",
+      "n-t" => "noun (temporal) (jisoumeishi)",
+      "nab" => "Nagano-ben",
+      "num" => "numeric",
+      "oK" => "word containing out-dated kanji",
+      "obs" => "obsolete term",
+      "obsc" => "obscure term",
+      "oik" => "old or irregular kana form",
+      "ok" => "out-dated or obsolete kana usage",
+      "on-mim" => "onomatopoeic or mimetic word",
+      "osb" => "Osaka-ben",
+      "physics" => "physics terminology",
+      "pn" => "pronoun",
+      "poet" => "poetical term",
+      "pol" => "polite (teineigo) language",
+      "pref" => "prefix",
+      "proverb" => "proverb",
+      "prt" => "particle",
+      "quote" => "quotation",
+      "rare" => "rare",
+      "rkb" => "Ryuukyuu-ben",
+      "sens" => "sensitive",
+      "shogi" => "shogi term",
+      "sl" => "slang",
+      "sports" => "sports term",
+      "suf" => "suffix",
+      "sumo" => "sumo term",
+      "thb" => "Touhoku-ben",
+      "tsb" => "Tosa-ben",
+      "tsug" => "Tsugaru-ben",
+      "uK" => "word usually written using kanji alone",
+      "uk" => "word usually written using kana alone",
+      "unc" => "unclassified",
+      "v-unspec" => "verb unspecified",
+      "v1" => "Ichidan verb",
+      "v1-s" => "Ichidan verb - kureru special class",
+      "v2a-s" => "Nidan verb with 'u' ending (archaic)",
+      "v2b-k" => "Nidan verb (upper class) with `bu' ending (archaic)",
+      "v2b-s" => "Nidan verb (lower class) with `bu' ending (archaic)",
+      "v2d-k" => "Nidan verb (upper class) with `dzu' ending (archaic)",
+      "v2d-s" => "Nidan verb (lower class) with `dzu' ending (archaic)",
+      "v2g-k" => "Nidan verb (upper class) with `gu' ending (archaic)",
+      "v2g-s" => "Nidan verb (lower class) with `gu' ending (archaic)",
+      "v2h-k" => "Nidan verb (upper class) with `hu/fu' ending (archaic)",
+      "v2h-s" => "Nidan verb (lower class) with `hu/fu' ending (archaic)",
+      "v2k-k" => "Nidan verb (upper class) with `ku' ending (archaic)",
+      "v2k-s" => "Nidan verb (lower class) with `ku' ending (archaic)",
+      "v2m-k" => "Nidan verb (upper class) with `mu' ending (archaic)",
+      "v2m-s" => "Nidan verb (lower class) with `mu' ending (archaic)",
+      "v2n-s" => "Nidan verb (lower class) with `nu' ending (archaic)",
+      "v2r-k" => "Nidan verb (upper class) with `ru' ending (archaic)",
+      "v2r-s" => "Nidan verb (lower class) with `ru' ending (archaic)",
+      "v2s-s" => "Nidan verb (lower class) with `su' ending (archaic)",
+      "v2t-k" => "Nidan verb (upper class) with `tsu' ending (archaic)",
+      "v2t-s" => "Nidan verb (lower class) with `tsu' ending (archaic)",
+      "v2w-s" => "Nidan verb (lower class) with `u' ending and `we' conjugation (archaic)",
+      "v2y-k" => "Nidan verb (upper class) with `yu' ending (archaic)",
+      "v2y-s" => "Nidan verb (lower class) with `yu' ending (archaic)",
+      "v2z-s" => "Nidan verb (lower class) with `zu' ending (archaic)",
+      "v4b" => "Yodan verb with `bu' ending (archaic)",
+      "v4g" => "Yodan verb with `gu' ending (archaic)",
+      "v4h" => "Yodan verb with `hu/fu' ending (archaic)",
+      "v4k" => "Yodan verb with `ku' ending (archaic)",
+      "v4m" => "Yodan verb with `mu' ending (archaic)",
+      "v4n" => "Yodan verb with `nu' ending (archaic)",
+      "v4r" => "Yodan verb with `ru' ending (archaic)",
+      "v4s" => "Yodan verb with `su' ending (archaic)",
+      "v4t" => "Yodan verb with `tsu' ending (archaic)",
+      "v5aru" => "Godan verb - -aru special class",
+      "v5b" => "Godan verb with `bu' ending",
+      "v5g" => "Godan verb with `gu' ending",
+      "v5k" => "Godan verb with `ku' ending",
+      "v5k-s" => "Godan verb - Iku/Yuku special class",
+      "v5m" => "Godan verb with `mu' ending",
+      "v5n" => "Godan verb with `nu' ending",
+      "v5r" => "Godan verb with `ru' ending",
+      "v5r-i" => "Godan verb with `ru' ending (irregular verb)",
+      "v5s" => "Godan verb with `su' ending",
+      "v5t" => "Godan verb with `tsu' ending",
+      "v5u" => "Godan verb with `u' ending",
+      "v5u-s" => "Godan verb with `u' ending (special class)",
+      "v5uru" => "Godan verb - Uru old class verb (old form of Eru)",
+      "vi" => "intransitive verb",
+      "vk" => "Kuru verb - special class",
+      "vn" => "irregular nu verb",
+      "vr" => "irregular ru verb, plain form ends with -ri",
+      "vs" => "noun or participle which takes the aux. verb suru",
+      "vs-c" => "su verb - precursor to the modern suru",
+      "vs-i" => "suru verb - included",
+      "vs-s" => "suru verb - special class",
+      "vt" => "transitive verb",
+      "vulg" => "vulgar expression or word",
+      "vz" => "Ichidan verb - zuru verb (alternative form of -jiru verbs)",
+      "yoji" => "yojijukugo",
+      "zool" => "zoology term"
+    }
+  end
+end

data/lib/eiwa/kanjidic/doc.rb ADDED

@@ -0,0 +1,43 @@
+module Eiwa
+  module Kanjidic
+    TAGS = {
+      "character" => Tag::Character,
+      "misc" => Tag::Bag,
+      "reading_meaning" => Tag::ReadingMeaning,
+      "rmgroup" => Tag::List
+    }
+    class Doc < Nokogiri::XML::SAX::Document
+      def initialize(each_entry_block)
+        @each_entry_block = each_entry_block
+        @current = nil
+      end
+      def start_element(name, attrs)
+        parent = @current
+        @current = (TAGS[name] || Tag::Other).new
+        @current.start(name, attrs, parent)
+      end
+      def end_element(name)
+        raise Eiwa::Error.new("Parsing error. Expected <#{@current.tag_name}> to close before <#{name}>") if @current.tag_name != name
+        ending = @current
+        ending.end_self
+        if ending.is_a?(Tag::Character)
+          @each_entry_block&.call(ending)
+        end
+        @current = ending.parent
+        @current&.end_child(ending)
+      end
+      def characters(s)
+        @current.add_characters(s)
+      end
+      def error(msg)
+        raise Eiwa::Error.new("Parsing error: #{msg}")
+      end
+    end
+  end
+end

data/lib/eiwa/parses_file.rb ADDED

@@ -0,0 +1,35 @@
+require "nokogiri"
+require_relative "jmdict/doc"
+require_relative "kanjidic/doc"
+module Eiwa
+  class ParsesFile
+    def call(filename, type, each_entry_block)
+      if each_entry_block.nil?
+        entries = []
+        each_entry_block ||= ->(e) { entries << e }
+      end
+      doc_for(type).new(each_entry_block).tap do |doc|
+        Nokogiri::XML::SAX::Parser.new(doc).parse_file(filename) do |ctx|
+          ctx.recovery = true
+        end
+      end
+      entries
+    end
+    private
+    def doc_for(type)
+      case type
+      when :jmdict_e
+        Jmdict::Doc
+      when :kanjidic2
+        Kanjidic::Doc
+      else
+        raise Eiwa::Error.new("Unknown file type: #{type}")
+      end
+    end
+  end
+end

data/lib/eiwa/tag/antonym.rb CHANGED

@@ -18,7 +18,7 @@ module Eiwa
         @text == other.text &&
           @sense_ordinal == other.sense_ordinal
       end
-      alias == eql?
+      alias_method :==, :eql?
       def hash
         @text.hash + @sense_ordinal.hash

data/lib/eiwa/tag/bag.rb ADDED

@@ -0,0 +1,21 @@
+module Eiwa
+  module Tag
+    # For simple elements that contain child element_name, value pairs that could plop into a hash nicely
+    class Bag < Any
+      attr_reader :values
+      def initialize
+        @values = {}
+      end
+      def [](key)
+        @values[key]
+      end
+      def end_child(child)
+        # Don't overwrite, first dupe tends to be the authoritative one
+        @values[child.tag_name] = child.text unless @values.key?(child.tag_name)
+      end
+    end
+  end
+end

data/lib/eiwa/tag/character.rb ADDED

@@ -0,0 +1,24 @@
+module Eiwa
+  module Tag
+    class Character < Any
+      attr_reader :text,
+        :grade, :stroke_count, :freq, :jlpt,
+        :onyomi, :kunyomi, :meanings
+      def end_child(child)
+        if child.tag_name == "literal"
+          @text = child.text
+        elsif child.tag_name == "reading_meaning"
+          @onyomi = child.rmgroup.items.select { |item| item.name == "reading" && item.attrs["r_type"] == "ja_on" }.map(&:text)
+          @kunyomi = child.rmgroup.items.select { |item| item.name == "reading" && item.attrs["r_type"] == "ja_kun" }.map(&:text)
+          @meanings = child.rmgroup.items.select { |item| item.name == "meaning" && (item.attrs["m_lang"].nil? || item.attrs["m_lang"] == "en") }.map(&:text)
+        elsif child.tag_name == "misc"
+          @grade = child["grade"]&.to_i
+          @stroke_count = child["stroke_count"]&.to_i
+          @freq = child["freq"]&.to_i
+          @jlpt = child["jlpt"]&.to_i
+        end
+      end
+    end
+  end
+end

data/lib/eiwa/tag/cross_reference.rb CHANGED

@@ -21,7 +21,7 @@ module Eiwa
           @reading == other.reading &&
           @sense_ordinal == other.sense_ordinal
       end
-      alias == eql?
+      alias_method :==, :eql?
       def hash
         @text.hash + @reading.hash + @sense_ordinal.hash

data/lib/eiwa/tag/definition.rb CHANGED

@@ -35,7 +35,7 @@ module Eiwa
           @gender == other.gender &&
           @type == other.type
       end
-      alias == eql?
+      alias_method :==, :eql?
       def hash
         @text.hash + @language.hash + @gender.hash + @type.hash

data/lib/eiwa/tag/entity.rb CHANGED

@@ -1,5 +1,3 @@
-require_relative "any"
 module Eiwa
   module Tag
     class Entity < Any
@@ -19,7 +17,7 @@ module Eiwa
         @code == other.code &&
           @text == other.text
       end
-      alias == eql?
+      alias_method :==, :eql?
       def hash
         @code.hash + @text.hash

data/lib/eiwa/tag/entry.rb CHANGED

@@ -1,5 +1,3 @@
-require_relative "any"
 module Eiwa
   module Tag
     class Entry < Any

data/lib/eiwa/tag/list.rb ADDED

@@ -0,0 +1,18 @@
+module Eiwa
+  module Tag
+    # For containers of lists or repeated elements
+    class List < Any
+      Item = Struct.new(:name, :attrs, :text, keyword_init: true)
+      attr_reader :items
+      def initialize
+        @items = []
+      end
+      def end_child(child)
+        @items << Item.new(name: child.tag_name, attrs: child.attrs, text: child.text)
+      end
+    end
+  end
+end

data/lib/eiwa/tag/meaning.rb CHANGED

@@ -1,5 +1,3 @@
-require_relative "any"
 module Eiwa
   module Tag
     class Meaning < Any

data/lib/eiwa/tag/other.rb CHANGED

@@ -1,9 +1,11 @@
-require_relative "any"
 module Eiwa
   module Tag
     class Other < Any
-      attr_reader :text
+      attr_reader :attrs
+      def text
+        @characters
+      end
     end
   end
 end

data/lib/eiwa/tag/reading.rb CHANGED

@@ -1,5 +1,3 @@
-require_relative "any"
 module Eiwa
   module Tag
     class Reading < Any

data/lib/eiwa/tag/reading_meaning.rb ADDED

@@ -0,0 +1,11 @@
+module Eiwa
+  module Tag
+    class ReadingMeaning < Any
+      attr_reader :rmgroup
+      def end_child(child)
+        @rmgroup = child if child.tag_name == "rmgroup"
+      end
+    end
+  end
+end

data/lib/eiwa/tag/source_language.rb CHANGED

@@ -23,7 +23,7 @@ module Eiwa
           @wasei == other.wasei &&
           @type == other.type
       end
-      alias == eql?
+      alias_method :==, :eql?
       def hash
         @text.hash + @language.hash + @wasei.hash + @type.hash

data/lib/eiwa/tag/spelling.rb CHANGED

@@ -1,5 +1,3 @@
-require_relative "any"
 module Eiwa
   module Tag
     class Spelling < Any

data/lib/eiwa/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Eiwa
-  VERSION = "0.0.2"
+  VERSION = "0.1.0"
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: eiwa
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.1.0
 platform: ruby
 authors:
 - Justin Searls
-autorequire:
+autorequire:
 bindir: exe
 cert_chain: []
-date: 2020-03-09 00:00:00.000000000 Z
+date: 2021-01-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -94,7 +94,7 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
-description:
+description:
 email:
 - searls@gmail.com
 executables: []
@@ -112,18 +112,23 @@ files:
 - bin/setup
 - eiwa.gemspec
 - lib/eiwa.rb
-- lib/eiwa/jmdict_doc.rb
-- lib/eiwa/jmdict_entities.rb
-- lib/eiwa/parses_jmdict_file.rb
+- lib/eiwa/jmdict/doc.rb
+- lib/eiwa/jmdict/entities.rb
+- lib/eiwa/kanjidic/doc.rb
+- lib/eiwa/parses_file.rb
 - lib/eiwa/tag/antonym.rb
 - lib/eiwa/tag/any.rb
+- lib/eiwa/tag/bag.rb
+- lib/eiwa/tag/character.rb
 - lib/eiwa/tag/cross_reference.rb
 - lib/eiwa/tag/definition.rb
 - lib/eiwa/tag/entity.rb
 - lib/eiwa/tag/entry.rb
+- lib/eiwa/tag/list.rb
 - lib/eiwa/tag/meaning.rb
 - lib/eiwa/tag/other.rb
 - lib/eiwa/tag/reading.rb
+- lib/eiwa/tag/reading_meaning.rb
 - lib/eiwa/tag/source_language.rb
 - lib/eiwa/tag/spelling.rb
 - lib/eiwa/version.rb
@@ -133,7 +138,7 @@ homepage: https://github.com/searls/eiwa
 licenses:
 - MIT
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -148,8 +153,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.0.3
-signing_key:
+rubygems_version: 3.1.4
+signing_key:
 specification_version: 4
 summary: Parses the JMDict Japanese-English dictionary
 test_files: []

data/lib/eiwa/jmdict_doc.rb DELETED

@@ -1,93 +0,0 @@
-require_relative "tag/entry"
-require_relative "tag/spelling"
-require_relative "tag/reading"
-require_relative "tag/meaning"
-require_relative "tag/entity"
-require_relative "tag/cross_reference"
-require_relative "tag/antonym"
-require_relative "tag/source_language"
-require_relative "tag/definition"
-require_relative "tag/other"
-require_relative "jmdict_entities"
-module Eiwa
-  TAGS = {
-    "entry" => Tag::Entry,
-    "k_ele" => Tag::Spelling,
-    "r_ele" => Tag::Reading,
-    "sense" => Tag::Meaning,
-    "pos" => Tag::Entity,
-    "misc" => Tag::Entity,
-    "dial" => Tag::Entity,
-    "field" => Tag::Entity,
-    "ke_inf" => Tag::Entity,
-    "re_inf" => Tag::Entity,
-    "xref" => Tag::CrossReference,
-    "ant" => Tag::Antonym,
-    "lsource" => Tag::SourceLanguage,
-    "gloss" => Tag::Definition
-  }
-  class JmdictDoc < Nokogiri::XML::SAX::Document
-    def initialize(each_entry_block)
-      @each_entry_block = each_entry_block
-    end
-    def start_document
-    end
-    def end_document
-    end
-    def start_element(name, attrs)
-      parent = @current
-      @current = (TAGS[name] || Tag::Other).new
-      @current.start(name, attrs, parent)
-    end
-    def end_element(name)
-      raise Eiwa::Error.new("Parsing error. Expected <#{@current.tag_name}> to close before <#{name}>") if @current.tag_name != name
-      ending = @current
-      ending.end_self
-      if ending.is_a?(Tag::Entry)
-        @each_entry_block&.call(ending)
-      end
-      @current = ending.parent
-      @current&.end_child(ending)
-    end
-    def characters(s)
-      @current.add_characters(s)
-    end
-    # def comment string
-    #   puts "comment #{string}"
-    # end
-    # def warning string
-    #   puts "warning #{string}"
-    # end
-    def error(msg)
-      if (matches = msg.match(/Entity '([\S]+)' not defined/))
-        # See: http://github.com/sparklemotion/nokogiri/issues/1926
-        code = matches[1]
-        @current.set_entity(code, JMDICT_ENTITIES[code])
-      elsif msg == "Detected an entity reference loop\n"
-        # Do nothing and hope this does not matter.
-      else
-        raise Eiwa::Error.new("Parsing error: #{msg}")
-      end
-    end
-    # def cdata_block string
-    #   puts "cdata_block #{string}"
-    # end
-    # def processing_instruction name, content
-    #   puts "processing_instruction #{name}, #{content}"
-    # end
-  end
-end

data/lib/eiwa/jmdict_entities.rb DELETED

@@ -1,178 +0,0 @@
-module Eiwa
-  JMDICT_ENTITIES = {
-    "Buddh" => "Buddhist term",
-    "MA" => "martial arts term",
-    "Shinto" => "Shinto term",
-    "X" => "rude or X-rated term (not displayed in educational software)",
-    "abbr" => "abbreviation",
-    "adj-f" => "noun or verb acting prenominally",
-    "adj-i" => "adjective (keiyoushi)",
-    "adj-ix" => "adjective (keiyoushi) - yoi/ii class",
-    "adj-kari" => "`kari' adjective (archaic)",
-    "adj-ku" => "`ku' adjective (archaic)",
-    "adj-na" => "adjectival nouns or quasi-adjectives (keiyodoshi)",
-    "adj-nari" => "archaic/formal form of na-adjective",
-    "adj-no" => "nouns which may take the genitive case particle `no'",
-    "adj-pn" => "pre-noun adjectival (rentaishi)",
-    "adj-shiku" => "`shiku' adjective (archaic)",
-    "adj-t" => "`taru' adjective",
-    "adv" => "adverb (fukushi)",
-    "adv-to" => "adverb taking the `to' particle",
-    "anat" => "anatomical term",
-    "arch" => "archaism",
-    "archit" => "architecture term",
-    "astron" => "astronomy, etc. term",
-    "ateji" => "ateji (phonetic) reading",
-    "aux" => "auxiliary",
-    "aux-adj" => "auxiliary adjective",
-    "aux-v" => "auxiliary verb",
-    "baseb" => "baseball term",
-    "biol" => "biology term",
-    "bot" => "botany term",
-    "bus" => "business term",
-    "chem" => "chemistry term",
-    "chn" => "children's language",
-    "col" => "colloquialism",
-    "comp" => "computer terminology",
-    "conj" => "conjunction",
-    "cop" => "copula",
-    "cop-da" => "copula",
-    "ctr" => "counter",
-    "derog" => "derogatory",
-    "eK" => "exclusively kanji",
-    "econ" => "economics term",
-    "ek" => "exclusively kana",
-    "engr" => "engineering term",
-    "exp" => "expressions (phrases, clauses, etc.)",
-    "fam" => "familiar language",
-    "fem" => "female term or language",
-    "finc" => "finance term",
-    "food" => "food term",
-    "geol" => "geology, etc. term",
-    "geom" => "geometry term",
-    "gikun" => "gikun (meaning as reading) or jukujikun (special kanji reading)",
-    "hob" => "Hokkaido-ben",
-    "hon" => "honorific or respectful (sonkeigo) language",
-    "hum" => "humble (kenjougo) language",
-    "iK" => "word containing irregular kanji usage",
-    "id" => "idiomatic expression",
-    "ik" => "word containing irregular kana usage",
-    "int" => "interjection (kandoushi)",
-    "io" => "irregular okurigana usage",
-    "iv" => "irregular verb",
-    "joc" => "jocular, humorous term",
-    "ksb" => "Kansai-ben",
-    "ktb" => "Kantou-ben",
-    "kyb" => "Kyoto-ben",
-    "kyu" => "Kyuushuu-ben",
-    "law" => "law, etc. term",
-    "ling" => "linguistics terminology",
-    "m-sl" => "manga slang",
-    "mahj" => "mahjong term",
-    "male" => "male term or language",
-    "male-sl" => "male slang",
-    "math" => "mathematics",
-    "med" => "medicine, etc. term",
-    "mil" => "military",
-    "music" => "music term",
-    "n" => "noun (common) (futsuumeishi)",
-    "n-adv" => "adverbial noun (fukushitekimeishi)",
-    "n-pr" => "proper noun",
-    "n-pref" => "noun, used as a prefix",
-    "n-suf" => "noun, used as a suffix",
-    "n-t" => "noun (temporal) (jisoumeishi)",
-    "nab" => "Nagano-ben",
-    "num" => "numeric",
-    "oK" => "word containing out-dated kanji",
-    "obs" => "obsolete term",
-    "obsc" => "obscure term",
-    "oik" => "old or irregular kana form",
-    "ok" => "out-dated or obsolete kana usage",
-    "on-mim" => "onomatopoeic or mimetic word",
-    "osb" => "Osaka-ben",
-    "physics" => "physics terminology",
-    "pn" => "pronoun",
-    "poet" => "poetical term",
-    "pol" => "polite (teineigo) language",
-    "pref" => "prefix",
-    "proverb" => "proverb",
-    "prt" => "particle",
-    "quote" => "quotation",
-    "rare" => "rare",
-    "rkb" => "Ryuukyuu-ben",
-    "sens" => "sensitive",
-    "shogi" => "shogi term",
-    "sl" => "slang",
-    "sports" => "sports term",
-    "suf" => "suffix",
-    "sumo" => "sumo term",
-    "thb" => "Touhoku-ben",
-    "tsb" => "Tosa-ben",
-    "tsug" => "Tsugaru-ben",
-    "uK" => "word usually written using kanji alone",
-    "uk" => "word usually written using kana alone",
-    "unc" => "unclassified",
-    "v-unspec" => "verb unspecified",
-    "v1" => "Ichidan verb",
-    "v1-s" => "Ichidan verb - kureru special class",
-    "v2a-s" => "Nidan verb with 'u' ending (archaic)",
-    "v2b-k" => "Nidan verb (upper class) with `bu' ending (archaic)",
-    "v2b-s" => "Nidan verb (lower class) with `bu' ending (archaic)",
-    "v2d-k" => "Nidan verb (upper class) with `dzu' ending (archaic)",
-    "v2d-s" => "Nidan verb (lower class) with `dzu' ending (archaic)",
-    "v2g-k" => "Nidan verb (upper class) with `gu' ending (archaic)",
-    "v2g-s" => "Nidan verb (lower class) with `gu' ending (archaic)",
-    "v2h-k" => "Nidan verb (upper class) with `hu/fu' ending (archaic)",
-    "v2h-s" => "Nidan verb (lower class) with `hu/fu' ending (archaic)",
-    "v2k-k" => "Nidan verb (upper class) with `ku' ending (archaic)",
-    "v2k-s" => "Nidan verb (lower class) with `ku' ending (archaic)",
-    "v2m-k" => "Nidan verb (upper class) with `mu' ending (archaic)",
-    "v2m-s" => "Nidan verb (lower class) with `mu' ending (archaic)",
-    "v2n-s" => "Nidan verb (lower class) with `nu' ending (archaic)",
-    "v2r-k" => "Nidan verb (upper class) with `ru' ending (archaic)",
-    "v2r-s" => "Nidan verb (lower class) with `ru' ending (archaic)",
-    "v2s-s" => "Nidan verb (lower class) with `su' ending (archaic)",
-    "v2t-k" => "Nidan verb (upper class) with `tsu' ending (archaic)",
-    "v2t-s" => "Nidan verb (lower class) with `tsu' ending (archaic)",
-    "v2w-s" => "Nidan verb (lower class) with `u' ending and `we' conjugation (archaic)",
-    "v2y-k" => "Nidan verb (upper class) with `yu' ending (archaic)",
-    "v2y-s" => "Nidan verb (lower class) with `yu' ending (archaic)",
-    "v2z-s" => "Nidan verb (lower class) with `zu' ending (archaic)",
-    "v4b" => "Yodan verb with `bu' ending (archaic)",
-    "v4g" => "Yodan verb with `gu' ending (archaic)",
-    "v4h" => "Yodan verb with `hu/fu' ending (archaic)",
-    "v4k" => "Yodan verb with `ku' ending (archaic)",
-    "v4m" => "Yodan verb with `mu' ending (archaic)",
-    "v4n" => "Yodan verb with `nu' ending (archaic)",
-    "v4r" => "Yodan verb with `ru' ending (archaic)",
-    "v4s" => "Yodan verb with `su' ending (archaic)",
-    "v4t" => "Yodan verb with `tsu' ending (archaic)",
-    "v5aru" => "Godan verb - -aru special class",
-    "v5b" => "Godan verb with `bu' ending",
-    "v5g" => "Godan verb with `gu' ending",
-    "v5k" => "Godan verb with `ku' ending",
-    "v5k-s" => "Godan verb - Iku/Yuku special class",
-    "v5m" => "Godan verb with `mu' ending",
-    "v5n" => "Godan verb with `nu' ending",
-    "v5r" => "Godan verb with `ru' ending",
-    "v5r-i" => "Godan verb with `ru' ending (irregular verb)",
-    "v5s" => "Godan verb with `su' ending",
-    "v5t" => "Godan verb with `tsu' ending",
-    "v5u" => "Godan verb with `u' ending",
-    "v5u-s" => "Godan verb with `u' ending (special class)",
-    "v5uru" => "Godan verb - Uru old class verb (old form of Eru)",
-    "vi" => "intransitive verb",
-    "vk" => "Kuru verb - special class",
-    "vn" => "irregular nu verb",
-    "vr" => "irregular ru verb, plain form ends with -ri",
-    "vs" => "noun or participle which takes the aux. verb suru",
-    "vs-c" => "su verb - precursor to the modern suru",
-    "vs-i" => "suru verb - included",
-    "vs-s" => "suru verb - special class",
-    "vt" => "transitive verb",
-    "vulg" => "vulgar expression or word",
-    "vz" => "Ichidan verb - zuru verb (alternative form of -jiru verbs)",
-    "yoji" => "yojijukugo",
-    "zool" => "zoology term"
-  }
-end

data/lib/eiwa/parses_jmdict_file.rb DELETED

@@ -1,21 +0,0 @@
-require "nokogiri"
-require_relative "jmdict_doc"
-module Eiwa
-  class ParsesJmdictFile
-    def call(filename, each_entry_block)
-      if each_entry_block.nil?
-        entries = []
-        each_entry_block ||= ->(e) { entries << e }
-      end
-      JmdictDoc.new(each_entry_block).tap do |doc|
-        Nokogiri::XML::SAX::Parser.new(doc).parse_file(filename) do |ctx|
-          ctx.recovery = true
-        end
-      end
-      entries
-    end
-  end
-end