RubyGems - eiwa - Versions diffs - 0.0.2 → 0.1.1 - Mend

eiwa 0.0.2 → 0.1.1

Files changed (32) hide show

checksums.yaml +4 -4
data/.github/workflows/ruby.yml +5 -8
data/.standard.yml +1 -0
data/Gemfile +5 -1
data/Gemfile.lock +61 -35
data/README.md +22 -13
data/eiwa.gemspec +1 -6
data/lib/eiwa/jmdict/doc.rb +85 -0
data/lib/eiwa/jmdict/entities.rb +180 -0
data/lib/eiwa/kanjidic/doc.rb +43 -0
data/lib/eiwa/parses_file.rb +35 -0
data/lib/eiwa/tag/antonym.rb +2 -2
data/lib/eiwa/tag/any.rb +1 -1
data/lib/eiwa/tag/bag.rb +21 -0
data/lib/eiwa/tag/character.rb +24 -0
data/lib/eiwa/tag/cross_reference.rb +3 -3
data/lib/eiwa/tag/definition.rb +2 -2
data/lib/eiwa/tag/entity.rb +2 -4
data/lib/eiwa/tag/entry.rb +0 -2
data/lib/eiwa/tag/list.rb +18 -0
data/lib/eiwa/tag/meaning.rb +0 -2
data/lib/eiwa/tag/other.rb +5 -3
data/lib/eiwa/tag/reading.rb +0 -2
data/lib/eiwa/tag/reading_meaning.rb +11 -0
data/lib/eiwa/tag/source_language.rb +2 -2
data/lib/eiwa/tag/spelling.rb +0 -2
data/lib/eiwa/version.rb +1 -1
data/lib/eiwa.rb +19 -7
metadata +19 -83
data/lib/eiwa/jmdict_doc.rb +0 -93
data/lib/eiwa/jmdict_entities.rb +0 -178
data/lib/eiwa/parses_jmdict_file.rb +0 -21

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: c5888e4802408cc8efdb55ddadfb560ec38d10971fee95b20dd53af2f31f487c
-  data.tar.gz: 93b7101b430ee123a905065f87e5d8ba336e1a18f145c7801daeb2b9b9a5ba72
+  metadata.gz: 87f19acddb018cdf9b99c46b8cd03b38c7ae13d2d006fe31938cf76ebba9da87
+  data.tar.gz: e1049fd3df59e89d3b45998a8bf7fbd0620f29a4ba5908a28df33bf72cbd1bc2
 SHA512:
-  metadata.gz: 64faccd9958b9c359fcd7a7ff40de013bd1060bbf9a59735be4d52c824dd3e6e77abbb81ab42ae037b9175990dd15baa33722503b47d8aaebbb037a8a303f965
-  data.tar.gz: 209efe931acfa8563ea1819f4e9b5a7d07a996d1545458b139de14f82971b27f47e0da726ed5151b3922aa05f6c5809bd3196269e0f6d254cb87e801c81ffe6e
+  metadata.gz: 5900ba9dd6094ca0b376f1a7067df65ddd289b6ef74c9b111ed44a28ac07daf7c32c6b92d898b85fc93278718e1e10fb674a90ae51df3b3303a80d4bf6365de2
+  data.tar.gz: ced4f6719aab797bbb5b1ba251f285751ae15982f36f5c296ebf3e6b607feff0350109a284cc8aee768c2cd248d7aa1f81227567ff549961bc4e37ec4193131b

data/.github/workflows/ruby.yml CHANGED Viewed

@@ -9,12 +9,9 @@ jobs:
     steps:
     - uses: actions/checkout@v1
-    - name: Set up Ruby 2.6
-      uses: actions/setup-ruby@v1
+    - uses: ruby/setup-ruby@v1
       with:
-        ruby-version: 2.6.x
-    - name: Build and test with Rake
-      run: |
-        gem install bundler
-        bundle install --jobs 4 --retry 3
-        bundle exec rake
+        ruby-version: '3.3'
+        bundler-cache: true
+    - name: Run tests
+      run: bundle exec rake

data/.standard.yml ADDED Viewed

@@ -0,0 +1 @@
+ruby_version: 3.0

data/Gemfile CHANGED Viewed

@@ -2,5 +2,9 @@ source "https://rubygems.org"
 git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
-# Specify your gem's dependencies in eiwa.gemspec
 gemspec
+gem "standard"
+gem "minitest"
+gem "rake"
+gem "m"

data/Gemfile.lock CHANGED Viewed

@@ -1,53 +1,79 @@
 PATH
   remote: .
   specs:
-    eiwa (0.0.2)
-      nokogiri
+    eiwa (0.1.1)
+      nokogiri (~> 1.15.5)
 GEM
   remote: https://rubygems.org/
   specs:
-    ast (2.4.0)
-    coderay (1.1.2)
-    jaro_winkler (1.5.3)
-    method_source (0.9.2)
-    mini_portile2 (2.4.0)
-    minitest (5.11.3)
-    nokogiri (1.10.9)
-      mini_portile2 (~> 2.4.0)
-    parallel (1.17.0)
-    parser (2.6.4.1)
-      ast (~> 2.4.0)
-    pry (0.12.2)
-      coderay (~> 1.1.0)
-      method_source (~> 0.9.0)
-    rainbow (3.0.0)
-    rake (13.0.1)
-    rubocop (0.72.0)
-      jaro_winkler (~> 1.5.1)
+    ast (2.4.2)
+    json (2.7.1)
+    language_server-protocol (3.17.0.3)
+    lint_roller (1.1.0)
+    m (1.6.2)
+      method_source (>= 0.6.7)
+      rake (>= 0.9.2.2)
+    method_source (1.0.0)
+    mini_portile2 (2.8.5)
+    minitest (5.22.2)
+    nokogiri (1.15.5)
+      mini_portile2 (~> 2.8.2)
+      racc (~> 1.4)
+    parallel (1.24.0)
+    parser (3.3.0.5)
+      ast (~> 2.4.1)
+      racc
+    racc (1.7.3)
+    rainbow (3.1.1)
+    rake (13.1.0)
+    regexp_parser (2.9.0)
+    rexml (3.2.6)
+    rubocop (1.62.1)
+      json (~> 2.3)
+      language_server-protocol (>= 3.17.0)
       parallel (~> 1.10)
-      parser (>= 2.6)
+      parser (>= 3.3.0.2)
       rainbow (>= 2.2.2, < 4.0)
+      regexp_parser (>= 1.8, < 3.0)
+      rexml (>= 3.2.5, < 4.0)
+      rubocop-ast (>= 1.31.1, < 2.0)
       ruby-progressbar (~> 1.7)
-      unicode-display_width (>= 1.4.0, < 1.7)
-    rubocop-performance (1.4.1)
-      rubocop (>= 0.71.0)
-    ruby-progressbar (1.10.1)
-    standard (0.1.4)
-      rubocop (~> 0.72.0)
-      rubocop-performance (~> 1.4.0)
-    unicode-display_width (1.6.0)
+      unicode-display_width (>= 2.4.0, < 3.0)
+    rubocop-ast (1.31.2)
+      parser (>= 3.3.0.4)
+    rubocop-performance (1.20.2)
+      rubocop (>= 1.48.1, < 2.0)
+      rubocop-ast (>= 1.30.0, < 2.0)
+    ruby-progressbar (1.13.0)
+    standard (1.34.0)
+      language_server-protocol (~> 3.17.0.2)
+      lint_roller (~> 1.0)
+      rubocop (~> 1.60)
+      standard-custom (~> 1.0.0)
+      standard-performance (~> 1.3)
+    standard-custom (1.0.2)
+      lint_roller (~> 1.0)
+      rubocop (~> 1.50)
+    standard-performance (1.3.1)
+      lint_roller (~> 1.1)
+      rubocop-performance (~> 1.20.2)
+    unicode-display_width (2.5.0)
 PLATFORMS
-  ruby
+  aarch64-linux
+  arm-linux
+  arm64-darwin
+  x86-linux
+  x86_64-darwin
+  x86_64-linux
 DEPENDENCIES
-  bundler (~> 1.17)
   eiwa!
-  minitest (~> 5.0)
-  pry
-  rake (~> 13.0)
+  m
+  minitest
+  rake
   standard
 BUNDLED WITH
-   1.17.3
+   2.5.4

data/README.md CHANGED Viewed

@@ -1,7 +1,12 @@
 # eiwa / 英和
-Parses the Japanese-English version of JMDict, a daily export of the WWWJDIC
-online Japanese dictionary.
+Parses two types of Japanese-English dictionaries:
+* `:jmdict_e` - [JMDict](http://www.edrdg.org/jmdict/edict_doc.html)'s
+  English-only export of the WWWJDIC online Japanese dictionary.
+* `:kanjidic2` - the
+  [KANJIDIC2](http://www.edrdg.org/wiki/index.php/KANJIDIC_Project) dictionary
+  of roughly 13,000 kanji characters
 ## Usage
@@ -23,15 +28,24 @@ gem 'eiwa'
 Get your hands on a supported dictionary. Right now eiwa only parses
 [JMDict](http://www.edrdg.org/jmdict/j_jmdict.html), which can be fetched from
-the [Monash ftp site](http://ftp.monash.edu/pub/nihongo/00INDEX.html) or with a
+the [EDRDG ftp site](http://ftp.edrdg.org/pub/Nihongo/00INDEX.html) or with a
 script like this, for the Japanese-English export:
 ```bash
-curl http://ftp.monash.edu/pub/nihongo/JMdict_e -o jmdict.xml
+# Download JMDICT-E:
+$ curl http://ftp.edrdg.org/pub/Nihongo/JMdict_e.gz -o jmdict.xml.gz
+# Unzip to jmdict.xml
+$ gunzip jmdict.xml.gz
+# Download KANJIDIC2:
+$ curl http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -o kanjidic2.xml.gz
+# Unzip to kanjidic2.xml
+$ gunzip kanjidic2.xml.gz
 ```
-This file is updated daily, and is essentially an export of all vocabulary on
-the [WWWJDIC application](http://nihongo.monash.edu/cgi-bin/wwwjdic?1C)
+These files are updated daily, and are essentially an export of all vocabulary
+and kanji in the [WWWJDIC
+application](http://nihongo.monash.edu/cgi-bin/wwwjdic?1C)
 ### Parse the dictionary
@@ -44,13 +58,11 @@ array and one that will invoke a provided block with each entry, but which won't
 retain a reference to the entries, allowing Ruby to garbage collect them as it
 goes.
-Parsing the dictionary is CPU intensive, and takes about 13 seconds on my 2019
-13" MacBook Pro.
 #### Passing a block
 If you just want to do some processing on each entry, it probably makes sense to
-invoke the library by passing a block
+invoke the library by passing a block (note that supported types include only
+`:jmdict_e` and `:kanjidic2`)
 ```ruby
 Eiwa.parse_file("path/to/some.xml", type: :jmdict_e) do |entry|
@@ -74,6 +86,3 @@ entries = Eiwa.parse_file("path/to/some.xml", type: :jmdict_e)
 Note that for the abridged Japanese-English dictionary, this will consume about
 500MB of RAM.
-### The entry object model
-I haven't documented the [Entry](https://github.com/searls/eiwa/blob/master/lib/eiwa/tag/entry.rb) type or its child types yet, but they should be pretty easy to piece together by inspecting the output and [checking the source listings](https://github.com/searls/eiwa/blob/master/lib/eiwa/tag).

data/eiwa.gemspec CHANGED Viewed

@@ -19,10 +19,5 @@ Gem::Specification.new do |spec|
   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
-  spec.add_dependency "nokogiri"
-  spec.add_development_dependency "bundler", "~> 1.17"
-  spec.add_development_dependency "rake", "~> 13.0"
-  spec.add_development_dependency "minitest", "~> 5.0"
-  spec.add_development_dependency "standard"
-  spec.add_development_dependency "pry"
+  spec.add_dependency "nokogiri", "~> 1.15.5"
 end

data/lib/eiwa/jmdict/doc.rb ADDED Viewed

@@ -0,0 +1,85 @@
+require_relative "entities"
+module Eiwa
+  module Jmdict
+    TAGS = {
+      "entry" => Tag::Entry,
+      "k_ele" => Tag::Spelling,
+      "r_ele" => Tag::Reading,
+      "sense" => Tag::Meaning,
+      "pos" => Tag::Entity,
+      "misc" => Tag::Entity,
+      "dial" => Tag::Entity,
+      "field" => Tag::Entity,
+      "ke_inf" => Tag::Entity,
+      "re_inf" => Tag::Entity,
+      "xref" => Tag::CrossReference,
+      "ant" => Tag::Antonym,
+      "lsource" => Tag::SourceLanguage,
+      "gloss" => Tag::Definition
+    }
+    class Doc < Nokogiri::XML::SAX::Document
+      def initialize(each_entry_block)
+        @each_entry_block = each_entry_block
+        @current = nil
+      end
+      def start_document
+      end
+      def end_document
+      end
+      def start_element(name, attrs)
+        parent = @current
+        @current = (TAGS[name] || Tag::Other).new
+        @current.start(name, attrs, parent)
+      end
+      def end_element(name)
+        raise Eiwa::Error.new("Parsing error. Expected <#{@current.tag_name}> to close before <#{name}>") if @current.tag_name != name
+        ending = @current
+        ending.end_self
+        if ending.is_a?(Tag::Entry)
+          @each_entry_block&.call(ending)
+        end
+        @current = ending.parent
+        @current&.end_child(ending)
+      end
+      def characters(s)
+        @current.add_characters(s)
+      end
+      # def comment string
+      #   puts "comment #{string}"
+      # end
+      # def warning string
+      #   puts "warning #{string}"
+      # end
+      def error(msg)
+        if (matches = msg.match(/Entity '(\S+)' not defined/))
+          # See: http://github.com/sparklemotion/nokogiri/issues/1926
+          code = matches[1]
+          @current.set_entity(code, ENTITIES[code])
+        elsif msg == "Detected an entity reference loop\n"
+          # Do nothing and hope this does not matter.
+        else
+          raise Eiwa::Error.new("Parsing error: #{msg}")
+        end
+      end
+      # def cdata_block string
+      #   puts "cdata_block #{string}"
+      # end
+      # def processing_instruction name, content
+      #   puts "processing_instruction #{name}, #{content}"
+      # end
+    end
+  end
+end

data/lib/eiwa/jmdict/entities.rb ADDED Viewed

@@ -0,0 +1,180 @@
+module Eiwa
+  module Jmdict
+    ENTITIES = {
+      "Buddh" => "Buddhist term",
+      "MA" => "martial arts term",
+      "Shinto" => "Shinto term",
+      "X" => "rude or X-rated term (not displayed in educational software)",
+      "abbr" => "abbreviation",
+      "adj-f" => "noun or verb acting prenominally",
+      "adj-i" => "adjective (keiyoushi)",
+      "adj-ix" => "adjective (keiyoushi) - yoi/ii class",
+      "adj-kari" => "`kari' adjective (archaic)",
+      "adj-ku" => "`ku' adjective (archaic)",
+      "adj-na" => "adjectival nouns or quasi-adjectives (keiyodoshi)",
+      "adj-nari" => "archaic/formal form of na-adjective",
+      "adj-no" => "nouns which may take the genitive case particle `no'",
+      "adj-pn" => "pre-noun adjectival (rentaishi)",
+      "adj-shiku" => "`shiku' adjective (archaic)",
+      "adj-t" => "`taru' adjective",
+      "adv" => "adverb (fukushi)",
+      "adv-to" => "adverb taking the `to' particle",
+      "anat" => "anatomical term",
+      "arch" => "archaism",
+      "archit" => "architecture term",
+      "astron" => "astronomy, etc. term",
+      "ateji" => "ateji (phonetic) reading",
+      "aux" => "auxiliary",
+      "aux-adj" => "auxiliary adjective",
+      "aux-v" => "auxiliary verb",
+      "baseb" => "baseball term",
+      "biol" => "biology term",
+      "bot" => "botany term",
+      "bus" => "business term",
+      "chem" => "chemistry term",
+      "chn" => "children's language",
+      "col" => "colloquialism",
+      "comp" => "computer terminology",
+      "conj" => "conjunction",
+      "cop" => "copula",
+      "cop-da" => "copula",
+      "ctr" => "counter",
+      "derog" => "derogatory",
+      "eK" => "exclusively kanji",
+      "econ" => "economics term",
+      "ek" => "exclusively kana",
+      "engr" => "engineering term",
+      "exp" => "expressions (phrases, clauses, etc.)",
+      "fam" => "familiar language",
+      "fem" => "female term or language",
+      "finc" => "finance term",
+      "food" => "food term",
+      "geol" => "geology, etc. term",
+      "geom" => "geometry term",
+      "gikun" => "gikun (meaning as reading) or jukujikun (special kanji reading)",
+      "hob" => "Hokkaido-ben",
+      "hon" => "honorific or respectful (sonkeigo) language",
+      "hum" => "humble (kenjougo) language",
+      "iK" => "word containing irregular kanji usage",
+      "id" => "idiomatic expression",
+      "ik" => "word containing irregular kana usage",
+      "int" => "interjection (kandoushi)",
+      "io" => "irregular okurigana usage",
+      "iv" => "irregular verb",
+      "joc" => "jocular, humorous term",
+      "ksb" => "Kansai-ben",
+      "ktb" => "Kantou-ben",
+      "kyb" => "Kyoto-ben",
+      "kyu" => "Kyuushuu-ben",
+      "law" => "law, etc. term",
+      "ling" => "linguistics terminology",
+      "m-sl" => "manga slang",
+      "mahj" => "mahjong term",
+      "male" => "male term or language",
+      "male-sl" => "male slang",
+      "math" => "mathematics",
+      "med" => "medicine, etc. term",
+      "mil" => "military",
+      "music" => "music term",
+      "n" => "noun (common) (futsuumeishi)",
+      "n-adv" => "adverbial noun (fukushitekimeishi)",
+      "n-pr" => "proper noun",
+      "n-pref" => "noun, used as a prefix",
+      "n-suf" => "noun, used as a suffix",
+      "n-t" => "noun (temporal) (jisoumeishi)",
+      "nab" => "Nagano-ben",
+      "num" => "numeric",
+      "oK" => "word containing out-dated kanji",
+      "obs" => "obsolete term",
+      "obsc" => "obscure term",
+      "oik" => "old or irregular kana form",
+      "ok" => "out-dated or obsolete kana usage",
+      "on-mim" => "onomatopoeic or mimetic word",
+      "osb" => "Osaka-ben",
+      "physics" => "physics terminology",
+      "pn" => "pronoun",
+      "poet" => "poetical term",
+      "pol" => "polite (teineigo) language",
+      "pref" => "prefix",
+      "proverb" => "proverb",
+      "prt" => "particle",
+      "quote" => "quotation",
+      "rare" => "rare",
+      "rkb" => "Ryuukyuu-ben",
+      "sens" => "sensitive",
+      "shogi" => "shogi term",
+      "sl" => "slang",
+      "sports" => "sports term",
+      "suf" => "suffix",
+      "sumo" => "sumo term",
+      "thb" => "Touhoku-ben",
+      "tsb" => "Tosa-ben",
+      "tsug" => "Tsugaru-ben",
+      "uK" => "word usually written using kanji alone",
+      "uk" => "word usually written using kana alone",
+      "unc" => "unclassified",
+      "v-unspec" => "verb unspecified",
+      "v1" => "Ichidan verb",
+      "v1-s" => "Ichidan verb - kureru special class",
+      "v2a-s" => "Nidan verb with 'u' ending (archaic)",
+      "v2b-k" => "Nidan verb (upper class) with `bu' ending (archaic)",
+      "v2b-s" => "Nidan verb (lower class) with `bu' ending (archaic)",
+      "v2d-k" => "Nidan verb (upper class) with `dzu' ending (archaic)",
+      "v2d-s" => "Nidan verb (lower class) with `dzu' ending (archaic)",
+      "v2g-k" => "Nidan verb (upper class) with `gu' ending (archaic)",
+      "v2g-s" => "Nidan verb (lower class) with `gu' ending (archaic)",
+      "v2h-k" => "Nidan verb (upper class) with `hu/fu' ending (archaic)",
+      "v2h-s" => "Nidan verb (lower class) with `hu/fu' ending (archaic)",
+      "v2k-k" => "Nidan verb (upper class) with `ku' ending (archaic)",
+      "v2k-s" => "Nidan verb (lower class) with `ku' ending (archaic)",
+      "v2m-k" => "Nidan verb (upper class) with `mu' ending (archaic)",
+      "v2m-s" => "Nidan verb (lower class) with `mu' ending (archaic)",
+      "v2n-s" => "Nidan verb (lower class) with `nu' ending (archaic)",
+      "v2r-k" => "Nidan verb (upper class) with `ru' ending (archaic)",
+      "v2r-s" => "Nidan verb (lower class) with `ru' ending (archaic)",
+      "v2s-s" => "Nidan verb (lower class) with `su' ending (archaic)",
+      "v2t-k" => "Nidan verb (upper class) with `tsu' ending (archaic)",
+      "v2t-s" => "Nidan verb (lower class) with `tsu' ending (archaic)",
+      "v2w-s" => "Nidan verb (lower class) with `u' ending and `we' conjugation (archaic)",
+      "v2y-k" => "Nidan verb (upper class) with `yu' ending (archaic)",
+      "v2y-s" => "Nidan verb (lower class) with `yu' ending (archaic)",
+      "v2z-s" => "Nidan verb (lower class) with `zu' ending (archaic)",
+      "v4b" => "Yodan verb with `bu' ending (archaic)",
+      "v4g" => "Yodan verb with `gu' ending (archaic)",
+      "v4h" => "Yodan verb with `hu/fu' ending (archaic)",
+      "v4k" => "Yodan verb with `ku' ending (archaic)",
+      "v4m" => "Yodan verb with `mu' ending (archaic)",
+      "v4n" => "Yodan verb with `nu' ending (archaic)",
+      "v4r" => "Yodan verb with `ru' ending (archaic)",
+      "v4s" => "Yodan verb with `su' ending (archaic)",
+      "v4t" => "Yodan verb with `tsu' ending (archaic)",
+      "v5aru" => "Godan verb - -aru special class",
+      "v5b" => "Godan verb with `bu' ending",
+      "v5g" => "Godan verb with `gu' ending",
+      "v5k" => "Godan verb with `ku' ending",
+      "v5k-s" => "Godan verb - Iku/Yuku special class",
+      "v5m" => "Godan verb with `mu' ending",
+      "v5n" => "Godan verb with `nu' ending",
+      "v5r" => "Godan verb with `ru' ending",
+      "v5r-i" => "Godan verb with `ru' ending (irregular verb)",
+      "v5s" => "Godan verb with `su' ending",
+      "v5t" => "Godan verb with `tsu' ending",
+      "v5u" => "Godan verb with `u' ending",
+      "v5u-s" => "Godan verb with `u' ending (special class)",
+      "v5uru" => "Godan verb - Uru old class verb (old form of Eru)",
+      "vi" => "intransitive verb",
+      "vk" => "Kuru verb - special class",
+      "vn" => "irregular nu verb",
+      "vr" => "irregular ru verb, plain form ends with -ri",
+      "vs" => "noun or participle which takes the aux. verb suru",
+      "vs-c" => "su verb - precursor to the modern suru",
+      "vs-i" => "suru verb - included",
+      "vs-s" => "suru verb - special class",
+      "vt" => "transitive verb",
+      "vulg" => "vulgar expression or word",
+      "vz" => "Ichidan verb - zuru verb (alternative form of -jiru verbs)",
+      "yoji" => "yojijukugo",
+      "zool" => "zoology term"
+    }
+  end
+end

data/lib/eiwa/kanjidic/doc.rb ADDED Viewed

@@ -0,0 +1,43 @@
+module Eiwa
+  module Kanjidic
+    TAGS = {
+      "character" => Tag::Character,
+      "misc" => Tag::Bag,
+      "reading_meaning" => Tag::ReadingMeaning,
+      "rmgroup" => Tag::List
+    }
+    class Doc < Nokogiri::XML::SAX::Document
+      def initialize(each_entry_block)
+        @each_entry_block = each_entry_block
+        @current = nil
+      end
+      def start_element(name, attrs)
+        parent = @current
+        @current = (TAGS[name] || Tag::Other).new
+        @current.start(name, attrs, parent)
+      end
+      def end_element(name)
+        raise Eiwa::Error.new("Parsing error. Expected <#{@current.tag_name}> to close before <#{name}>") if @current.tag_name != name
+        ending = @current
+        ending.end_self
+        if ending.is_a?(Tag::Character)
+          @each_entry_block&.call(ending)
+        end
+        @current = ending.parent
+        @current&.end_child(ending)
+      end
+      def characters(s)
+        @current.add_characters(s)
+      end
+      def error(msg)
+        raise Eiwa::Error.new("Parsing error: #{msg}")
+      end
+    end
+  end
+end

data/lib/eiwa/parses_file.rb ADDED Viewed

@@ -0,0 +1,35 @@
+require "nokogiri"
+require_relative "jmdict/doc"
+require_relative "kanjidic/doc"
+module Eiwa
+  class ParsesFile
+    def call(filename, type, each_entry_block)
+      if each_entry_block.nil?
+        entries = []
+        each_entry_block ||= ->(e) { entries << e }
+      end
+      doc_for(type).new(each_entry_block).tap do |doc|
+        Nokogiri::XML::SAX::Parser.new(doc).parse_file(filename) do |ctx|
+          ctx.recovery = true
+        end
+      end
+      entries
+    end
+    private
+    def doc_for(type)
+      case type
+      when :jmdict_e
+        Jmdict::Doc
+      when :kanjidic2
+        Kanjidic::Doc
+      else
+        raise Eiwa::Error.new("Unknown file type: #{type}")
+      end
+    end
+  end
+end

data/lib/eiwa/tag/antonym.rb CHANGED Viewed

@@ -18,10 +18,10 @@ module Eiwa
         @text == other.text &&
           @sense_ordinal == other.sense_ordinal
       end
-      alias == eql?
+      alias_method :==, :eql?
       def hash
-        @text.hash + @sense_ordinal.hash
+        [@text, @sense_ordinal].hash
       end
     end
   end

data/lib/eiwa/tag/any.rb CHANGED Viewed

@@ -5,7 +5,7 @@ module Eiwa
       def start(tag_name, attrs, parent)
         @tag_name = tag_name
-        @attrs = Hash[attrs]
+        @attrs = attrs.to_h
         @parent = parent
       end

data/lib/eiwa/tag/bag.rb ADDED Viewed

@@ -0,0 +1,21 @@
+module Eiwa
+  module Tag
+    # For simple elements whose children are (element_name, value) pairs that fit naturally into a hash
+    class Bag < Any
+      attr_reader :values
+      def initialize
+        @values = {}
+      end
+      def [](key)
+        @values[key]
+      end
+      def end_child(child)
+        # Don't overwrite; the first duplicate tends to be the authoritative one
+        @values[child.tag_name] = child.text unless @values.key?(child.tag_name)
+      end
+    end
+  end
+end

data/lib/eiwa/tag/character.rb ADDED Viewed

@@ -0,0 +1,24 @@
+module Eiwa
+  module Tag
+    class Character < Any
+      attr_reader :text,
+        :grade, :stroke_count, :freq, :jlpt,
+        :onyomi, :kunyomi, :meanings
+      def end_child(child)
+        if child.tag_name == "literal"
+          @text = child.text
+        elsif child.tag_name == "reading_meaning"
+          @onyomi = child.rmgroup.items.select { |item| item.name == "reading" && item.attrs["r_type"] == "ja_on" }.map(&:text)
+          @kunyomi = child.rmgroup.items.select { |item| item.name == "reading" && item.attrs["r_type"] == "ja_kun" }.map(&:text)
+          @meanings = child.rmgroup.items.select { |item| item.name == "meaning" && (item.attrs["m_lang"].nil? || item.attrs["m_lang"] == "en") }.map(&:text)
+        elsif child.tag_name == "misc"
+          @grade = child["grade"]&.to_i
+          @stroke_count = child["stroke_count"]&.to_i
+          @freq = child["freq"]&.to_i
+          @jlpt = child["jlpt"]&.to_i
+        end
+      end
+    end
+  end
+end