RubyGems - eiwa - Versions diffs - 0.0.2 → 0.1.0 - Mend

eiwa 0.0.2 → 0.1.0

Files changed (27) hide show

checksums.yaml +4 -4
data/Gemfile.lock +35 -27
data/README.md +22 -13
data/lib/eiwa.rb +19 -7
data/lib/eiwa/jmdict/doc.rb +85 -0
data/lib/eiwa/jmdict/entities.rb +180 -0
data/lib/eiwa/kanjidic/doc.rb +43 -0
data/lib/eiwa/parses_file.rb +35 -0
data/lib/eiwa/tag/antonym.rb +1 -1
data/lib/eiwa/tag/bag.rb +21 -0
data/lib/eiwa/tag/character.rb +24 -0
data/lib/eiwa/tag/cross_reference.rb +1 -1
data/lib/eiwa/tag/definition.rb +1 -1
data/lib/eiwa/tag/entity.rb +1 -3
data/lib/eiwa/tag/entry.rb +0 -2
data/lib/eiwa/tag/list.rb +18 -0
data/lib/eiwa/tag/meaning.rb +0 -2
data/lib/eiwa/tag/other.rb +5 -3
data/lib/eiwa/tag/reading.rb +0 -2
data/lib/eiwa/tag/reading_meaning.rb +11 -0
data/lib/eiwa/tag/source_language.rb +1 -1
data/lib/eiwa/tag/spelling.rb +0 -2
data/lib/eiwa/version.rb +1 -1
metadata +15 -10
data/lib/eiwa/jmdict_doc.rb +0 -93
data/lib/eiwa/jmdict_entities.rb +0 -178
data/lib/eiwa/parses_jmdict_file.rb +0 -21

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: c5888e4802408cc8efdb55ddadfb560ec38d10971fee95b20dd53af2f31f487c
-  data.tar.gz: 93b7101b430ee123a905065f87e5d8ba336e1a18f145c7801daeb2b9b9a5ba72
+  metadata.gz: ef277fcf117e28dcbc32fc6e17c8d34f1783b546f9adf5b9452e04d073d31ee8
+  data.tar.gz: d1e744a10c6e688e532da9855790ba1a9fad89338b75f0924ee9a5f90294e3ee
 SHA512:
-  metadata.gz: 64faccd9958b9c359fcd7a7ff40de013bd1060bbf9a59735be4d52c824dd3e6e77abbb81ab42ae037b9175990dd15baa33722503b47d8aaebbb037a8a303f965
-  data.tar.gz: 209efe931acfa8563ea1819f4e9b5a7d07a996d1545458b139de14f82971b27f47e0da726ed5151b3922aa05f6c5809bd3196269e0f6d254cb87e801c81ffe6e
+  metadata.gz: '090773b16ffc636c53cbd957e5f6131449c455bf4e00c39908f78403c2c284bd7364b9e33ea9f6d8ac9a930777cfdc1f67e296d701ee7224fd6572da812f5f93'
+  data.tar.gz: d43f3a9ded86a0a3238eaac28c86b1d3d7b95dbe3d7a7c6eb8fe389014c7393600009f0c79f69cf7b06ff8dcd84650c0fa679524703cd36207b795b994892375

data/Gemfile.lock CHANGED

@@ -1,42 +1,50 @@
 PATH
   remote: .
   specs:
-    eiwa (0.0.2)
+    eiwa (0.1.0)
       nokogiri
 GEM
   remote: https://rubygems.org/
   specs:
-    ast (2.4.0)
-    coderay (1.1.2)
-    jaro_winkler (1.5.3)
-    method_source (0.9.2)
-    mini_portile2 (2.4.0)
-    minitest (5.11.3)
-    nokogiri (1.10.9)
-      mini_portile2 (~> 2.4.0)
-    parallel (1.17.0)
-    parser (2.6.4.1)
-      ast (~> 2.4.0)
-    pry (0.12.2)
-      coderay (~> 1.1.0)
-      method_source (~> 0.9.0)
+    ast (2.4.1)
+    coderay (1.1.3)
+    method_source (1.0.0)
+    mini_portile2 (2.5.0)
+    minitest (5.14.3)
+    nokogiri (1.11.1)
+      mini_portile2 (~> 2.5.0)
+      racc (~> 1.4)
+    parallel (1.20.1)
+    parser (3.0.0.0)
+      ast (~> 2.4.1)
+    pry (0.13.1)
+      coderay (~> 1.1)
+      method_source (~> 1.0)
+    racc (1.5.2)
     rainbow (3.0.0)
-    rake (13.0.1)
-    rubocop (0.72.0)
-      jaro_winkler (~> 1.5.1)
+    rake (13.0.3)
+    regexp_parser (2.0.3)
+    rexml (3.2.4)
+    rubocop (1.7.0)
       parallel (~> 1.10)
-      parser (>= 2.6)
+      parser (>= 2.7.1.5)
       rainbow (>= 2.2.2, < 4.0)
+      regexp_parser (>= 1.8, < 3.0)
+      rexml
+      rubocop-ast (>= 1.2.0, < 2.0)
       ruby-progressbar (~> 1.7)
-      unicode-display_width (>= 1.4.0, < 1.7)
-    rubocop-performance (1.4.1)
-      rubocop (>= 0.71.0)
-    ruby-progressbar (1.10.1)
-    standard (0.1.4)
-      rubocop (~> 0.72.0)
-      rubocop-performance (~> 1.4.0)
-    unicode-display_width (1.6.0)
+      unicode-display_width (>= 1.4.0, < 2.0)
+    rubocop-ast (1.4.0)
+      parser (>= 2.7.1.5)
+    rubocop-performance (1.9.2)
+      rubocop (>= 0.90.0, < 2.0)
+      rubocop-ast (>= 0.4.0)
+    ruby-progressbar (1.11.0)
+    standard (0.11.0)
+      rubocop (= 1.7.0)
+      rubocop-performance (= 1.9.2)
+    unicode-display_width (1.7.0)
 PLATFORMS
   ruby

data/README.md CHANGED

@@ -1,7 +1,12 @@
 # eiwa / 英和
-Parses the Japanese-English version of JMDict, a daily export of the WWWJDIC
-online Japanese dictionary.
+Parses two types of Japanese-English dictionaries:
+* `:jmdict_e` - [JMDict](http://www.edrdg.org/jmdict/edict_doc.html)'s
+  English-only export of the WWWJDIC online Japanese dictionary.
+* `:kanjidic2` - the
+  [KANJIDIC2](http://www.edrdg.org/wiki/index.php/KANJIDIC_Project) dictionary
+  of roughly 13,000 kanji characters
 ## Usage
@@ -23,15 +28,24 @@ gem 'eiwa'
 Get your hands on a supported dictionary. Right now eiwa only parses
 [JMDict](http://www.edrdg.org/jmdict/j_jmdict.html), which can be fetched from
-the [Monash ftp site](http://ftp.monash.edu/pub/nihongo/00INDEX.html) or with a
+the [EDRDG ftp site](http://ftp.edrdg.org/pub/Nihongo/00INDEX.html) or with a
 script like this, for the Japanese-English export:
 ```bash
-curl http://ftp.monash.edu/pub/nihongo/JMdict_e -o jmdict.xml
+# Download JMDICT-E:
+$ curl http://ftp.edrdg.org/pub/Nihongo/JMdict_e.gz -o jmdict.xml.gz
+# Unzip to jmdict.xml
+$ gunzip jmdict.xml.gz
+# Download KANJIDIC2:
+$ curl http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -o kanjidic2.xml.gz
+# Unzip to kanjidic2.xml
+$ gunzip kanjidic2.xml.gz
 ```
-This file is updated daily, and is essentially an export of all vocabulary on
-the [WWWJDIC application](http://nihongo.monash.edu/cgi-bin/wwwjdic?1C)
+These files are updated daily, and are essentially an export of all vocabulary
+and kanji in the [WWWJDIC
+application](http://nihongo.monash.edu/cgi-bin/wwwjdic?1C)
 ### Parse the dictionary
@@ -44,13 +58,11 @@ array and one that will invoke a provided block with each entry, but which won't
 retain a reference to the entries, allowing Ruby to garbage collect them as it
 goes.
-Parsing the dictionary is CPU intensive, and takes about 13 seconds on my 2019
-13" MacBook Pro.
 #### Passing a block
 If you just want to do some processing on each entry, it probably makes sense to
-invoke the library by passing a block
+invoke the library by passing a block (note that supported types include only
+`:jmdict_e` and `:kanjidic2`)
 ```ruby
 Eiwa.parse_file("path/to/some.xml", type: :jmdict_e) do |entry|
@@ -74,6 +86,3 @@ entries = Eiwa.parse_file("path/to/some.xml", type: :jmdict_e)
 Note that for the abridged Japanese-English dictionary, this will consume about
 500MB of RAM.
-### The entry object model
-I haven't documented the [Entry](https://github.com/searls/eiwa/blob/master/lib/eiwa/tag/entry.rb) type or its child types yet, but they should be pretty easy to piece together by inspecting the output and [checking the source listings](https://github.com/searls/eiwa/blob/master/lib/eiwa/tag).

data/lib/eiwa.rb CHANGED

@@ -1,15 +1,27 @@
 require "eiwa/version"
-require "eiwa/parses_jmdict_file"
+require "eiwa/tag/any"
+require "eiwa/tag/character"
+require "eiwa/tag/bag"
+require "eiwa/tag/list"
+require "eiwa/tag/reading_meaning"
+require "eiwa/tag/entry"
+require "eiwa/tag/spelling"
+require "eiwa/tag/reading"
+require "eiwa/tag/meaning"
+require "eiwa/tag/entity"
+require "eiwa/tag/cross_reference"
+require "eiwa/tag/antonym"
+require "eiwa/tag/source_language"
+require "eiwa/tag/definition"
+require "eiwa/tag/other"
+require "eiwa/parses_file"
 module Eiwa
   class Error < StandardError; end
   def self.parse_file(filename, type: :jmdict_e, &each_entry_block)
-    case type
-    when :jmdict_e
-      ParsesJmdictFile.new.call(filename, each_entry_block)
-    else
-      raise Eiwa::Error.new("Unknown file type: #{type}")
-    end
+    ParsesFile.new.call(filename, type, each_entry_block)
   end
 end

data/lib/eiwa/jmdict/doc.rb ADDED

@@ -0,0 +1,85 @@
+require_relative "entities"
+module Eiwa
+  module Jmdict
+    TAGS = {
+      "entry" => Tag::Entry,
+      "k_ele" => Tag::Spelling,
+      "r_ele" => Tag::Reading,
+      "sense" => Tag::Meaning,
+      "pos" => Tag::Entity,
+      "misc" => Tag::Entity,
+      "dial" => Tag::Entity,
+      "field" => Tag::Entity,
+      "ke_inf" => Tag::Entity,
+      "re_inf" => Tag::Entity,
+      "xref" => Tag::CrossReference,
+      "ant" => Tag::Antonym,
+      "lsource" => Tag::SourceLanguage,
+      "gloss" => Tag::Definition
+    }
+    class Doc < Nokogiri::XML::SAX::Document
+      def initialize(each_entry_block)
+        @each_entry_block = each_entry_block
+        @current = nil
+      end
+      def start_document
+      end
+      def end_document
+      end
+      def start_element(name, attrs)
+        parent = @current
+        @current = (TAGS[name] || Tag::Other).new
+        @current.start(name, attrs, parent)
+      end
+      def end_element(name)
+        raise Eiwa::Error.new("Parsing error. Expected <#{@current.tag_name}> to close before <#{name}>") if @current.tag_name != name
+        ending = @current
+        ending.end_self
+        if ending.is_a?(Tag::Entry)
+          @each_entry_block&.call(ending)
+        end
+        @current = ending.parent
+        @current&.end_child(ending)
+      end
+      def characters(s)
+        @current.add_characters(s)
+      end
+      # def comment string
+      #   puts "comment #{string}"
+      # end
+      # def warning string
+      #   puts "warning #{string}"
+      # end
+      def error(msg)
+        if (matches = msg.match(/Entity '(\S+)' not defined/))
+          # See: http://github.com/sparklemotion/nokogiri/issues/1926
+          code = matches[1]
+          @current.set_entity(code, ENTITIES[code])
+        elsif msg == "Detected an entity reference loop\n"
+          # Do nothing and hope this does not matter.
+        else
+          raise Eiwa::Error.new("Parsing error: #{msg}")
+        end
+      end
+      # def cdata_block string
+      #   puts "cdata_block #{string}"
+      # end
+      # def processing_instruction name, content
+      #   puts "processing_instruction #{name}, #{content}"
+      # end
+    end
+  end
+end

data/lib/eiwa/jmdict/entities.rb ADDED

@@ -0,0 +1,180 @@
+module Eiwa
+  module Jmdict
+    ENTITIES = {
+      "Buddh" => "Buddhist term",
+      "MA" => "martial arts term",
+      "Shinto" => "Shinto term",
+      "X" => "rude or X-rated term (not displayed in educational software)",
+      "abbr" => "abbreviation",
+      "adj-f" => "noun or verb acting prenominally",
+      "adj-i" => "adjective (keiyoushi)",
+      "adj-ix" => "adjective (keiyoushi) - yoi/ii class",
+      "adj-kari" => "`kari' adjective (archaic)",
+      "adj-ku" => "`ku' adjective (archaic)",
+      "adj-na" => "adjectival nouns or quasi-adjectives (keiyodoshi)",
+      "adj-nari" => "archaic/formal form of na-adjective",
+      "adj-no" => "nouns which may take the genitive case particle `no'",
+      "adj-pn" => "pre-noun adjectival (rentaishi)",
+      "adj-shiku" => "`shiku' adjective (archaic)",
+      "adj-t" => "`taru' adjective",
+      "adv" => "adverb (fukushi)",
+      "adv-to" => "adverb taking the `to' particle",
+      "anat" => "anatomical term",
+      "arch" => "archaism",
+      "archit" => "architecture term",
+      "astron" => "astronomy, etc. term",
+      "ateji" => "ateji (phonetic) reading",
+      "aux" => "auxiliary",
+      "aux-adj" => "auxiliary adjective",
+      "aux-v" => "auxiliary verb",
+      "baseb" => "baseball term",
+      "biol" => "biology term",
+      "bot" => "botany term",
+      "bus" => "business term",
+      "chem" => "chemistry term",
+      "chn" => "children's language",
+      "col" => "colloquialism",
+      "comp" => "computer terminology",
+      "conj" => "conjunction",
+      "cop" => "copula",
+      "cop-da" => "copula",
+      "ctr" => "counter",
+      "derog" => "derogatory",
+      "eK" => "exclusively kanji",
+      "econ" => "economics term",
+      "ek" => "exclusively kana",
+      "engr" => "engineering term",
+      "exp" => "expressions (phrases, clauses, etc.)",
+      "fam" => "familiar language",
+      "fem" => "female term or language",
+      "finc" => "finance term",
+      "food" => "food term",
+      "geol" => "geology, etc. term",
+      "geom" => "geometry term",
+      "gikun" => "gikun (meaning as reading) or jukujikun (special kanji reading)",
+      "hob" => "Hokkaido-ben",
+      "hon" => "honorific or respectful (sonkeigo) language",
+      "hum" => "humble (kenjougo) language",
+      "iK" => "word containing irregular kanji usage",
+      "id" => "idiomatic expression",
+      "ik" => "word containing irregular kana usage",
+      "int" => "interjection (kandoushi)",
+      "io" => "irregular okurigana usage",
+      "iv" => "irregular verb",
+      "joc" => "jocular, humorous term",
+      "ksb" => "Kansai-ben",
+      "ktb" => "Kantou-ben",
+      "kyb" => "Kyoto-ben",
+      "kyu" => "Kyuushuu-ben",
+      "law" => "law, etc. term",
+      "ling" => "linguistics terminology",
+      "m-sl" => "manga slang",
+      "mahj" => "mahjong term",
+      "male" => "male term or language",
+      "male-sl" => "male slang",
+      "math" => "mathematics",
+      "med" => "medicine, etc. term",
+      "mil" => "military",
+      "music" => "music term",
+      "n" => "noun (common) (futsuumeishi)",
+      "n-adv" => "adverbial noun (fukushitekimeishi)",
+      "n-pr" => "proper noun",
+      "n-pref" => "noun, used as a prefix",
+      "n-suf" => "noun, used as a suffix",
+      "n-t" => "noun (temporal) (jisoumeishi)",
+      "nab" => "Nagano-ben",
+      "num" => "numeric",
+      "oK" => "word containing out-dated kanji",
+      "obs" => "obsolete term",
+      "obsc" => "obscure term",
+      "oik" => "old or irregular kana form",
+      "ok" => "out-dated or obsolete kana usage",
+      "on-mim" => "onomatopoeic or mimetic word",
+      "osb" => "Osaka-ben",
+      "physics" => "physics terminology",
+      "pn" => "pronoun",
+      "poet" => "poetical term",
+      "pol" => "polite (teineigo) language",
+      "pref" => "prefix",
+      "proverb" => "proverb",
+      "prt" => "particle",
+      "quote" => "quotation",
+      "rare" => "rare",
+      "rkb" => "Ryuukyuu-ben",
+      "sens" => "sensitive",
+      "shogi" => "shogi term",
+      "sl" => "slang",
+      "sports" => "sports term",
+      "suf" => "suffix",
+      "sumo" => "sumo term",
+      "thb" => "Touhoku-ben",
+      "tsb" => "Tosa-ben",
+      "tsug" => "Tsugaru-ben",
+      "uK" => "word usually written using kanji alone",
+      "uk" => "word usually written using kana alone",
+      "unc" => "unclassified",
+      "v-unspec" => "verb unspecified",
+      "v1" => "Ichidan verb",
+      "v1-s" => "Ichidan verb - kureru special class",
+      "v2a-s" => "Nidan verb with 'u' ending (archaic)",
+      "v2b-k" => "Nidan verb (upper class) with `bu' ending (archaic)",
+      "v2b-s" => "Nidan verb (lower class) with `bu' ending (archaic)",
+      "v2d-k" => "Nidan verb (upper class) with `dzu' ending (archaic)",
+      "v2d-s" => "Nidan verb (lower class) with `dzu' ending (archaic)",
+      "v2g-k" => "Nidan verb (upper class) with `gu' ending (archaic)",
+      "v2g-s" => "Nidan verb (lower class) with `gu' ending (archaic)",
+      "v2h-k" => "Nidan verb (upper class) with `hu/fu' ending (archaic)",
+      "v2h-s" => "Nidan verb (lower class) with `hu/fu' ending (archaic)",
+      "v2k-k" => "Nidan verb (upper class) with `ku' ending (archaic)",
+      "v2k-s" => "Nidan verb (lower class) with `ku' ending (archaic)",
+      "v2m-k" => "Nidan verb (upper class) with `mu' ending (archaic)",
+      "v2m-s" => "Nidan verb (lower class) with `mu' ending (archaic)",
+      "v2n-s" => "Nidan verb (lower class) with `nu' ending (archaic)",
+      "v2r-k" => "Nidan verb (upper class) with `ru' ending (archaic)",
+      "v2r-s" => "Nidan verb (lower class) with `ru' ending (archaic)",
+      "v2s-s" => "Nidan verb (lower class) with `su' ending (archaic)",
+      "v2t-k" => "Nidan verb (upper class) with `tsu' ending (archaic)",
+      "v2t-s" => "Nidan verb (lower class) with `tsu' ending (archaic)",
+      "v2w-s" => "Nidan verb (lower class) with `u' ending and `we' conjugation (archaic)",
+      "v2y-k" => "Nidan verb (upper class) with `yu' ending (archaic)",
+      "v2y-s" => "Nidan verb (lower class) with `yu' ending (archaic)",
+      "v2z-s" => "Nidan verb (lower class) with `zu' ending (archaic)",
+      "v4b" => "Yodan verb with `bu' ending (archaic)",
+      "v4g" => "Yodan verb with `gu' ending (archaic)",
+      "v4h" => "Yodan verb with `hu/fu' ending (archaic)",
+      "v4k" => "Yodan verb with `ku' ending (archaic)",
+      "v4m" => "Yodan verb with `mu' ending (archaic)",
+      "v4n" => "Yodan verb with `nu' ending (archaic)",
+      "v4r" => "Yodan verb with `ru' ending (archaic)",
+      "v4s" => "Yodan verb with `su' ending (archaic)",
+      "v4t" => "Yodan verb with `tsu' ending (archaic)",
+      "v5aru" => "Godan verb - -aru special class",
+      "v5b" => "Godan verb with `bu' ending",
+      "v5g" => "Godan verb with `gu' ending",
+      "v5k" => "Godan verb with `ku' ending",
+      "v5k-s" => "Godan verb - Iku/Yuku special class",
+      "v5m" => "Godan verb with `mu' ending",
+      "v5n" => "Godan verb with `nu' ending",
+      "v5r" => "Godan verb with `ru' ending",
+      "v5r-i" => "Godan verb with `ru' ending (irregular verb)",
+      "v5s" => "Godan verb with `su' ending",
+      "v5t" => "Godan verb with `tsu' ending",
+      "v5u" => "Godan verb with `u' ending",
+      "v5u-s" => "Godan verb with `u' ending (special class)",
+      "v5uru" => "Godan verb - Uru old class verb (old form of Eru)",
+      "vi" => "intransitive verb",
+      "vk" => "Kuru verb - special class",
+      "vn" => "irregular nu verb",
+      "vr" => "irregular ru verb, plain form ends with -ri",
+      "vs" => "noun or participle which takes the aux. verb suru",
+      "vs-c" => "su verb - precursor to the modern suru",
+      "vs-i" => "suru verb - included",
+      "vs-s" => "suru verb - special class",
+      "vt" => "transitive verb",
+      "vulg" => "vulgar expression or word",
+      "vz" => "Ichidan verb - zuru verb (alternative form of -jiru verbs)",
+      "yoji" => "yojijukugo",
+      "zool" => "zoology term"
+    }
+  end
+end

data/lib/eiwa/kanjidic/doc.rb ADDED

@@ -0,0 +1,43 @@
+module Eiwa
+  module Kanjidic
+    TAGS = {
+      "character" => Tag::Character,
+      "misc" => Tag::Bag,
+      "reading_meaning" => Tag::ReadingMeaning,
+      "rmgroup" => Tag::List
+    }
+    class Doc < Nokogiri::XML::SAX::Document
+      def initialize(each_entry_block)
+        @each_entry_block = each_entry_block
+        @current = nil
+      end
+      def start_element(name, attrs)
+        parent = @current
+        @current = (TAGS[name] || Tag::Other).new
+        @current.start(name, attrs, parent)
+      end
+      def end_element(name)
+        raise Eiwa::Error.new("Parsing error. Expected <#{@current.tag_name}> to close before <#{name}>") if @current.tag_name != name
+        ending = @current
+        ending.end_self
+        if ending.is_a?(Tag::Character)
+          @each_entry_block&.call(ending)
+        end
+        @current = ending.parent
+        @current&.end_child(ending)
+      end
+      def characters(s)
+        @current.add_characters(s)
+      end
+      def error(msg)
+        raise Eiwa::Error.new("Parsing error: #{msg}")
+      end
+    end
+  end
+end

data/lib/eiwa/parses_file.rb ADDED

@@ -0,0 +1,35 @@
+require "nokogiri"
+require_relative "jmdict/doc"
+require_relative "kanjidic/doc"
+module Eiwa
+  class ParsesFile
+    def call(filename, type, each_entry_block)
+      if each_entry_block.nil?
+        entries = []
+        each_entry_block ||= ->(e) { entries << e }
+      end
+      doc_for(type).new(each_entry_block).tap do |doc|
+        Nokogiri::XML::SAX::Parser.new(doc).parse_file(filename) do |ctx|
+          ctx.recovery = true
+        end
+      end
+      entries
+    end
+    private
+    def doc_for(type)
+      case type
+      when :jmdict_e
+        Jmdict::Doc
+      when :kanjidic2
+        Kanjidic::Doc
+      else
+        raise Eiwa::Error.new("Unknown file type: #{type}")
+      end
+    end
+  end
+end

data/lib/eiwa/tag/antonym.rb CHANGED

@@ -18,7 +18,7 @@ module Eiwa
         @text == other.text &&
           @sense_ordinal == other.sense_ordinal
       end
-      alias == eql?
+      alias_method :==, :eql?
       def hash
         @text.hash + @sense_ordinal.hash

data/lib/eiwa/tag/bag.rb ADDED

@@ -0,0 +1,21 @@
+module Eiwa
+  module Tag
+    # For simple elements that contain child element_name, value pairs that could plop into a hash nicely
+    class Bag < Any
+      attr_reader :values
+      def initialize
+        @values = {}
+      end
+      def [](key)
+        @values[key]
+      end
+      def end_child(child)
+        # Don't overwrite, first dupe tends to be the authoritative one
+        @values[child.tag_name] = child.text unless @values.key?(child.tag_name)
+      end
+    end
+  end
+end

data/lib/eiwa/tag/character.rb ADDED

@@ -0,0 +1,24 @@
+module Eiwa
+  module Tag
+    class Character < Any
+      attr_reader :text,
+        :grade, :stroke_count, :freq, :jlpt,
+        :onyomi, :kunyomi, :meanings
+      def end_child(child)
+        if child.tag_name == "literal"
+          @text = child.text
+        elsif child.tag_name == "reading_meaning"
+          @onyomi = child.rmgroup.items.select { |item| item.name == "reading" && item.attrs["r_type"] == "ja_on" }.map(&:text)
+          @kunyomi = child.rmgroup.items.select { |item| item.name == "reading" && item.attrs["r_type"] == "ja_kun" }.map(&:text)
+          @meanings = child.rmgroup.items.select { |item| item.name == "meaning" && (item.attrs["m_lang"].nil? || item.attrs["m_lang"] == "en") }.map(&:text)
+        elsif child.tag_name == "misc"
+          @grade = child["grade"]&.to_i
+          @stroke_count = child["stroke_count"]&.to_i
+          @freq = child["freq"]&.to_i
+          @jlpt = child["jlpt"]&.to_i
+        end
+      end
+    end
+  end
+end

data/lib/eiwa/tag/cross_reference.rb CHANGED

@@ -21,7 +21,7 @@ module Eiwa
           @reading == other.reading &&
           @sense_ordinal == other.sense_ordinal
       end
-      alias == eql?
+      alias_method :==, :eql?
       def hash
         @text.hash + @reading.hash + @sense_ordinal.hash

data/lib/eiwa/tag/definition.rb CHANGED

@@ -35,7 +35,7 @@ module Eiwa
           @gender == other.gender &&
           @type == other.type
       end
-      alias == eql?
+      alias_method :==, :eql?
       def hash
         @text.hash + @language.hash + @gender.hash + @type.hash

data/lib/eiwa/tag/entity.rb CHANGED

@@ -1,5 +1,3 @@
-require_relative "any"
 module Eiwa
   module Tag
     class Entity < Any
@@ -19,7 +17,7 @@ module Eiwa
         @code == other.code &&
           @text == other.text
       end
-      alias == eql?
+      alias_method :==, :eql?
       def hash
         @code.hash + @text.hash

data/lib/eiwa/tag/entry.rb CHANGED

@@ -1,5 +1,3 @@
-require_relative "any"
 module Eiwa
   module Tag
     class Entry < Any

data/lib/eiwa/tag/list.rb ADDED

@@ -0,0 +1,18 @@
+module Eiwa
+  module Tag
+    # For containers of lists or repeated elements
+    class List < Any
+      Item = Struct.new(:name, :attrs, :text, keyword_init: true)
+      attr_reader :items
+      def initialize
+        @items = []
+      end
+      def end_child(child)
+        @items << Item.new(name: child.tag_name, attrs: child.attrs, text: child.text)
+      end
+    end
+  end
+end

data/lib/eiwa/tag/meaning.rb CHANGED

@@ -1,5 +1,3 @@
-require_relative "any"
 module Eiwa
   module Tag
     class Meaning < Any

data/lib/eiwa/tag/other.rb CHANGED

@@ -1,9 +1,11 @@
-require_relative "any"
 module Eiwa
   module Tag
     class Other < Any
-      attr_reader :text
+      attr_reader :attrs
+      def text
+        @characters
+      end
     end
   end
 end

data/lib/eiwa/tag/reading.rb CHANGED

@@ -1,5 +1,3 @@
-require_relative "any"
 module Eiwa
   module Tag
     class Reading < Any

data/lib/eiwa/tag/reading_meaning.rb ADDED

@@ -0,0 +1,11 @@
+module Eiwa
+  module Tag
+    class ReadingMeaning < Any
+      attr_reader :rmgroup
+      def end_child(child)
+        @rmgroup = child if child.tag_name == "rmgroup"
+      end
+    end
+  end
+end

data/lib/eiwa/tag/source_language.rb CHANGED

@@ -23,7 +23,7 @@ module Eiwa
           @wasei == other.wasei &&
           @type == other.type
       end
-      alias == eql?
+      alias_method :==, :eql?
       def hash
         @text.hash + @language.hash + @wasei.hash + @type.hash

data/lib/eiwa/tag/spelling.rb CHANGED

@@ -1,5 +1,3 @@
-require_relative "any"
 module Eiwa
   module Tag
     class Spelling < Any

data/lib/eiwa/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Eiwa
-  VERSION = "0.0.2"
+  VERSION = "0.1.0"
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: eiwa
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.1.0
 platform: ruby
 authors:
 - Justin Searls
-autorequire:
+autorequire:
 bindir: exe
 cert_chain: []
-date: 2020-03-09 00:00:00.000000000 Z
+date: 2021-01-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -94,7 +94,7 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
-description:
+description:
 email:
 - searls@gmail.com
 executables: []
@@ -112,18 +112,23 @@ files:
 - bin/setup
 - eiwa.gemspec
 - lib/eiwa.rb
-- lib/eiwa/jmdict_doc.rb
-- lib/eiwa/jmdict_entities.rb
-- lib/eiwa/parses_jmdict_file.rb
+- lib/eiwa/jmdict/doc.rb
+- lib/eiwa/jmdict/entities.rb
+- lib/eiwa/kanjidic/doc.rb
+- lib/eiwa/parses_file.rb
 - lib/eiwa/tag/antonym.rb
 - lib/eiwa/tag/any.rb
+- lib/eiwa/tag/bag.rb
+- lib/eiwa/tag/character.rb
 - lib/eiwa/tag/cross_reference.rb
 - lib/eiwa/tag/definition.rb
 - lib/eiwa/tag/entity.rb
 - lib/eiwa/tag/entry.rb
+- lib/eiwa/tag/list.rb
 - lib/eiwa/tag/meaning.rb
 - lib/eiwa/tag/other.rb
 - lib/eiwa/tag/reading.rb
+- lib/eiwa/tag/reading_meaning.rb
 - lib/eiwa/tag/source_language.rb
 - lib/eiwa/tag/spelling.rb
 - lib/eiwa/version.rb
@@ -133,7 +138,7 @@ homepage: https://github.com/searls/eiwa
 licenses:
 - MIT
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -148,8 +153,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.0.3
-signing_key:
+rubygems_version: 3.1.4
+signing_key:
 specification_version: 4
 summary: Parses the JMDict Japanese-English dictionary
 test_files: []

data/lib/eiwa/jmdict_doc.rb DELETED

@@ -1,93 +0,0 @@
-require_relative "tag/entry"
-require_relative "tag/spelling"
-require_relative "tag/reading"
-require_relative "tag/meaning"
-require_relative "tag/entity"
-require_relative "tag/cross_reference"
-require_relative "tag/antonym"
-require_relative "tag/source_language"
-require_relative "tag/definition"
-require_relative "tag/other"
-require_relative "jmdict_entities"
-module Eiwa
-  TAGS = {
-    "entry" => Tag::Entry,
-    "k_ele" => Tag::Spelling,
-    "r_ele" => Tag::Reading,
-    "sense" => Tag::Meaning,
-    "pos" => Tag::Entity,
-    "misc" => Tag::Entity,
-    "dial" => Tag::Entity,
-    "field" => Tag::Entity,
-    "ke_inf" => Tag::Entity,
-    "re_inf" => Tag::Entity,
-    "xref" => Tag::CrossReference,
-    "ant" => Tag::Antonym,
-    "lsource" => Tag::SourceLanguage,
-    "gloss" => Tag::Definition
-  }
-  class JmdictDoc < Nokogiri::XML::SAX::Document
-    def initialize(each_entry_block)
-      @each_entry_block = each_entry_block
-    end
-    def start_document
-    end
-    def end_document
-    end
-    def start_element(name, attrs)
-      parent = @current
-      @current = (TAGS[name] || Tag::Other).new
-      @current.start(name, attrs, parent)
-    end
-    def end_element(name)
-      raise Eiwa::Error.new("Parsing error. Expected <#{@current.tag_name}> to close before <#{name}>") if @current.tag_name != name
-      ending = @current
-      ending.end_self
-      if ending.is_a?(Tag::Entry)
-        @each_entry_block&.call(ending)
-      end
-      @current = ending.parent
-      @current&.end_child(ending)
-    end
-    def characters(s)
-      @current.add_characters(s)
-    end
-    # def comment string
-    #   puts "comment #{string}"
-    # end
-    # def warning string
-    #   puts "warning #{string}"
-    # end
-    def error(msg)
-      if (matches = msg.match(/Entity '([\S]+)' not defined/))
-        # See: http://github.com/sparklemotion/nokogiri/issues/1926
-        code = matches[1]
-        @current.set_entity(code, JMDICT_ENTITIES[code])
-      elsif msg == "Detected an entity reference loop\n"
-        # Do nothing and hope this does not matter.
-      else
-        raise Eiwa::Error.new("Parsing error: #{msg}")
-      end
-    end
-    # def cdata_block string
-    #   puts "cdata_block #{string}"
-    # end
-    # def processing_instruction name, content
-    #   puts "processing_instruction #{name}, #{content}"
-    # end
-  end
-end

data/lib/eiwa/jmdict_entities.rb DELETED

@@ -1,178 +0,0 @@
-module Eiwa
-  JMDICT_ENTITIES = {
-    "Buddh" => "Buddhist term",
-    "MA" => "martial arts term",
-    "Shinto" => "Shinto term",
-    "X" => "rude or X-rated term (not displayed in educational software)",
-    "abbr" => "abbreviation",
-    "adj-f" => "noun or verb acting prenominally",
-    "adj-i" => "adjective (keiyoushi)",
-    "adj-ix" => "adjective (keiyoushi) - yoi/ii class",
-    "adj-kari" => "`kari' adjective (archaic)",
-    "adj-ku" => "`ku' adjective (archaic)",
-    "adj-na" => "adjectival nouns or quasi-adjectives (keiyodoshi)",
-    "adj-nari" => "archaic/formal form of na-adjective",
-    "adj-no" => "nouns which may take the genitive case particle `no'",
-    "adj-pn" => "pre-noun adjectival (rentaishi)",
-    "adj-shiku" => "`shiku' adjective (archaic)",
-    "adj-t" => "`taru' adjective",
-    "adv" => "adverb (fukushi)",
-    "adv-to" => "adverb taking the `to' particle",
-    "anat" => "anatomical term",
-    "arch" => "archaism",
-    "archit" => "architecture term",
-    "astron" => "astronomy, etc. term",
-    "ateji" => "ateji (phonetic) reading",
-    "aux" => "auxiliary",
-    "aux-adj" => "auxiliary adjective",
-    "aux-v" => "auxiliary verb",
-    "baseb" => "baseball term",
-    "biol" => "biology term",
-    "bot" => "botany term",
-    "bus" => "business term",
-    "chem" => "chemistry term",
-    "chn" => "children's language",
-    "col" => "colloquialism",
-    "comp" => "computer terminology",
-    "conj" => "conjunction",
-    "cop" => "copula",
-    "cop-da" => "copula",
-    "ctr" => "counter",
-    "derog" => "derogatory",
-    "eK" => "exclusively kanji",
-    "econ" => "economics term",
-    "ek" => "exclusively kana",
-    "engr" => "engineering term",
-    "exp" => "expressions (phrases, clauses, etc.)",
-    "fam" => "familiar language",
-    "fem" => "female term or language",
-    "finc" => "finance term",
-    "food" => "food term",
-    "geol" => "geology, etc. term",
-    "geom" => "geometry term",
-    "gikun" => "gikun (meaning as reading) or jukujikun (special kanji reading)",
-    "hob" => "Hokkaido-ben",
-    "hon" => "honorific or respectful (sonkeigo) language",
-    "hum" => "humble (kenjougo) language",
-    "iK" => "word containing irregular kanji usage",
-    "id" => "idiomatic expression",
-    "ik" => "word containing irregular kana usage",
-    "int" => "interjection (kandoushi)",
-    "io" => "irregular okurigana usage",
-    "iv" => "irregular verb",
-    "joc" => "jocular, humorous term",
-    "ksb" => "Kansai-ben",
-    "ktb" => "Kantou-ben",
-    "kyb" => "Kyoto-ben",
-    "kyu" => "Kyuushuu-ben",
-    "law" => "law, etc. term",
-    "ling" => "linguistics terminology",
-    "m-sl" => "manga slang",
-    "mahj" => "mahjong term",
-    "male" => "male term or language",
-    "male-sl" => "male slang",
-    "math" => "mathematics",
-    "med" => "medicine, etc. term",
-    "mil" => "military",
-    "music" => "music term",
-    "n" => "noun (common) (futsuumeishi)",
-    "n-adv" => "adverbial noun (fukushitekimeishi)",
-    "n-pr" => "proper noun",
-    "n-pref" => "noun, used as a prefix",
-    "n-suf" => "noun, used as a suffix",
-    "n-t" => "noun (temporal) (jisoumeishi)",
-    "nab" => "Nagano-ben",
-    "num" => "numeric",
-    "oK" => "word containing out-dated kanji",
-    "obs" => "obsolete term",
-    "obsc" => "obscure term",
-    "oik" => "old or irregular kana form",
-    "ok" => "out-dated or obsolete kana usage",
-    "on-mim" => "onomatopoeic or mimetic word",
-    "osb" => "Osaka-ben",
-    "physics" => "physics terminology",
-    "pn" => "pronoun",
-    "poet" => "poetical term",
-    "pol" => "polite (teineigo) language",
-    "pref" => "prefix",
-    "proverb" => "proverb",
-    "prt" => "particle",
-    "quote" => "quotation",
-    "rare" => "rare",
-    "rkb" => "Ryuukyuu-ben",
-    "sens" => "sensitive",
-    "shogi" => "shogi term",
-    "sl" => "slang",
-    "sports" => "sports term",
-    "suf" => "suffix",
-    "sumo" => "sumo term",
-    "thb" => "Touhoku-ben",
-    "tsb" => "Tosa-ben",
-    "tsug" => "Tsugaru-ben",
-    "uK" => "word usually written using kanji alone",
-    "uk" => "word usually written using kana alone",
-    "unc" => "unclassified",
-    "v-unspec" => "verb unspecified",
-    "v1" => "Ichidan verb",
-    "v1-s" => "Ichidan verb - kureru special class",
-    "v2a-s" => "Nidan verb with 'u' ending (archaic)",
-    "v2b-k" => "Nidan verb (upper class) with `bu' ending (archaic)",
-    "v2b-s" => "Nidan verb (lower class) with `bu' ending (archaic)",
-    "v2d-k" => "Nidan verb (upper class) with `dzu' ending (archaic)",
-    "v2d-s" => "Nidan verb (lower class) with `dzu' ending (archaic)",
-    "v2g-k" => "Nidan verb (upper class) with `gu' ending (archaic)",
-    "v2g-s" => "Nidan verb (lower class) with `gu' ending (archaic)",
-    "v2h-k" => "Nidan verb (upper class) with `hu/fu' ending (archaic)",
-    "v2h-s" => "Nidan verb (lower class) with `hu/fu' ending (archaic)",
-    "v2k-k" => "Nidan verb (upper class) with `ku' ending (archaic)",
-    "v2k-s" => "Nidan verb (lower class) with `ku' ending (archaic)",
-    "v2m-k" => "Nidan verb (upper class) with `mu' ending (archaic)",
-    "v2m-s" => "Nidan verb (lower class) with `mu' ending (archaic)",
-    "v2n-s" => "Nidan verb (lower class) with `nu' ending (archaic)",
-    "v2r-k" => "Nidan verb (upper class) with `ru' ending (archaic)",
-    "v2r-s" => "Nidan verb (lower class) with `ru' ending (archaic)",
-    "v2s-s" => "Nidan verb (lower class) with `su' ending (archaic)",
-    "v2t-k" => "Nidan verb (upper class) with `tsu' ending (archaic)",
-    "v2t-s" => "Nidan verb (lower class) with `tsu' ending (archaic)",
-    "v2w-s" => "Nidan verb (lower class) with `u' ending and `we' conjugation (archaic)",
-    "v2y-k" => "Nidan verb (upper class) with `yu' ending (archaic)",
-    "v2y-s" => "Nidan verb (lower class) with `yu' ending (archaic)",
-    "v2z-s" => "Nidan verb (lower class) with `zu' ending (archaic)",
-    "v4b" => "Yodan verb with `bu' ending (archaic)",
-    "v4g" => "Yodan verb with `gu' ending (archaic)",
-    "v4h" => "Yodan verb with `hu/fu' ending (archaic)",
-    "v4k" => "Yodan verb with `ku' ending (archaic)",
-    "v4m" => "Yodan verb with `mu' ending (archaic)",
-    "v4n" => "Yodan verb with `nu' ending (archaic)",
-    "v4r" => "Yodan verb with `ru' ending (archaic)",
-    "v4s" => "Yodan verb with `su' ending (archaic)",
-    "v4t" => "Yodan verb with `tsu' ending (archaic)",
-    "v5aru" => "Godan verb - -aru special class",
-    "v5b" => "Godan verb with `bu' ending",
-    "v5g" => "Godan verb with `gu' ending",
-    "v5k" => "Godan verb with `ku' ending",
-    "v5k-s" => "Godan verb - Iku/Yuku special class",
-    "v5m" => "Godan verb with `mu' ending",
-    "v5n" => "Godan verb with `nu' ending",
-    "v5r" => "Godan verb with `ru' ending",
-    "v5r-i" => "Godan verb with `ru' ending (irregular verb)",
-    "v5s" => "Godan verb with `su' ending",
-    "v5t" => "Godan verb with `tsu' ending",
-    "v5u" => "Godan verb with `u' ending",
-    "v5u-s" => "Godan verb with `u' ending (special class)",
-    "v5uru" => "Godan verb - Uru old class verb (old form of Eru)",
-    "vi" => "intransitive verb",
-    "vk" => "Kuru verb - special class",
-    "vn" => "irregular nu verb",
-    "vr" => "irregular ru verb, plain form ends with -ri",
-    "vs" => "noun or participle which takes the aux. verb suru",
-    "vs-c" => "su verb - precursor to the modern suru",
-    "vs-i" => "suru verb - included",
-    "vs-s" => "suru verb - special class",
-    "vt" => "transitive verb",
-    "vulg" => "vulgar expression or word",
-    "vz" => "Ichidan verb - zuru verb (alternative form of -jiru verbs)",
-    "yoji" => "yojijukugo",
-    "zool" => "zoology term"
-  }
-end

data/lib/eiwa/parses_jmdict_file.rb DELETED

@@ -1,21 +0,0 @@
-require "nokogiri"
-require_relative "jmdict_doc"
-module Eiwa
-  class ParsesJmdictFile
-    def call(filename, each_entry_block)
-      if each_entry_block.nil?
-        entries = []
-        each_entry_block ||= ->(e) { entries << e }
-      end
-      JmdictDoc.new(each_entry_block).tap do |doc|
-        Nokogiri::XML::SAX::Parser.new(doc).parse_file(filename) do |ctx|
-          ctx.recovery = true
-        end
-      end
-      entries
-    end
-  end
-end