eiwa 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: e88ea7a5b39e145c4e5115b1e1f0505fcb1f9dd63f1f68c54de89903dabaef67
4
+ data.tar.gz: 2da703a8cd7daa9c4ade8a5d9090f814bfc8adcdd62fd3dcb381b3aba2bc089c
5
+ SHA512:
6
+ metadata.gz: 561cfcb808e8cfa05f225c92edfa861eea3b3173141ba133ff00c988b0071c9f162ed0ccd321532b7de36bf5797f6b0b00c408cd919d8657e844612c2c5450cf
7
+ data.tar.gz: a85484fe894a30e3c6f094332a1ae143cb0af95aec164d470aaf87c9614e86dd9e61fd6b8ed986767afd863afb07f830eaf9160deeca6e76b5edc03240c20232
@@ -0,0 +1,24 @@
1
+ # Continuous integration: installs the bundle and runs the default Rake task
2
+ # on every push.
3
+ name: Ruby
4
+
5
+ on: [push]
6
+
7
+ jobs:
8
+   build:
9
+
10
+     runs-on: ubuntu-latest
11
+
12
+     steps:
13
+       - uses: actions/checkout@v1
14
+       - name: Set up Ruby 2.6
15
+         uses: actions/setup-ruby@v1
16
+         with:
17
+           ruby-version: "2.6.x"
18
+       - name: Build and test with Rake
19
+         run: |
20
+           # Stay on Bundler 1.x: the gemspec requires bundler ~> 1.17 and
21
+           # Gemfile.lock was bundled with 1.17.3.
22
+           gem install bundler -v "~> 1.17"
23
+           bundle install --jobs 4 --retry 3
24
+           bundle exec rake
@@ -0,0 +1,8 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) { |repo_name| "https://github.com/#{repo_name}" } # maps a "user/repo" name to its HTTPS GitHub URL
4
+
5
+ # Specify your gem's dependencies in eiwa.gemspec
6
+ gemspec
@@ -0,0 +1,53 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ eiwa (0.0.1)
5
+ nokogiri
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ ast (2.4.0)
11
+ coderay (1.1.2)
12
+ jaro_winkler (1.5.3)
13
+ method_source (0.9.2)
14
+ mini_portile2 (2.4.0)
15
+ minitest (5.11.3)
16
+ nokogiri (1.10.4)
17
+ mini_portile2 (~> 2.4.0)
18
+ parallel (1.17.0)
19
+ parser (2.6.4.1)
20
+ ast (~> 2.4.0)
21
+ pry (0.12.2)
22
+ coderay (~> 1.1.0)
23
+ method_source (~> 0.9.0)
24
+ rainbow (3.0.0)
25
+ rake (10.5.0)
26
+ rubocop (0.72.0)
27
+ jaro_winkler (~> 1.5.1)
28
+ parallel (~> 1.10)
29
+ parser (>= 2.6)
30
+ rainbow (>= 2.2.2, < 4.0)
31
+ ruby-progressbar (~> 1.7)
32
+ unicode-display_width (>= 1.4.0, < 1.7)
33
+ rubocop-performance (1.4.1)
34
+ rubocop (>= 0.71.0)
35
+ ruby-progressbar (1.10.1)
36
+ standard (0.1.4)
37
+ rubocop (~> 0.72.0)
38
+ rubocop-performance (~> 1.4.0)
39
+ unicode-display_width (1.6.0)
40
+
41
+ PLATFORMS
42
+ ruby
43
+
44
+ DEPENDENCIES
45
+ bundler (~> 1.17)
46
+ eiwa!
47
+ minitest (~> 5.0)
48
+ pry
49
+ rake (~> 10.0)
50
+ standard
51
+
52
+ BUNDLED WITH
53
+ 1.17.3
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2019 Justin Searls
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,77 @@
1
+ # eiwa / 英和
2
+
3
+ Parses the Japanese-English version of JMDict, a daily export of the WWWJDIC
4
+ online Japanese dictionary.
5
+
6
+ ## Usage
7
+
8
+ ### Install
9
+
10
+ Install the gem:
11
+
12
+ ```
13
+ gem install eiwa
14
+ ```
15
+
16
+ Or add it to your `Gemfile`:
17
+
18
+ ```ruby
19
+ gem 'eiwa'
20
+ ```
21
+
22
+ ### Download a supported dictionary
23
+
24
+ Get your hands on a supported dictionary. Right now eiwa only parses
25
+ [JMDict](http://www.edrdg.org/jmdict/j_jmdict.html), which can be fetched from
26
+ the [Monash ftp site](http://ftp.monash.edu/pub/nihongo/00INDEX.html) or with a
27
+ script like this, for the Japanese-English export:
28
+
29
+ ```bash
30
+ curl http://ftp.monash.edu/pub/nihongo/JMdict_e -o jmdict.xml
31
+ ```
32
+
33
+ This file is updated daily, and is essentially an export of all vocabulary on
34
+ the [WWWJDIC application](http://nihongo.monash.edu/cgi-bin/wwwjdic?1C).
35
+
36
+ ### Parse the dictionary
37
+
38
+ The eiwa gem implements an evented [SAX
39
+ parser](https://en.wikipedia.org/wiki/Simple_API_for_XML) via nokogiri to
40
+ efficiently work through the very large XML file, as loading a full DOM into
41
+ memory is very resource-intensive. In consideration of this, eiwa's parsing
42
+ method provides two modes, one that will return every dictionary entry in an
43
+ array and one that will invoke a provided block with each entry, but which won't
44
+ retain a reference to the entries, allowing Ruby to garbage collect them as it
45
+ goes.
46
+
47
+ Parsing the dictionary is CPU intensive, and takes about 13 seconds on my 2019
48
+ 13" MacBook Pro.
49
+
50
+ #### Passing a block
51
+
52
+ If you just want to do some processing on each entry, it probably makes sense to
53
+ invoke the library by passing a block:
54
+
55
+ ```ruby
56
+ Eiwa.parse_file("path/to/some.xml", type: :jmdict_e) do |entry|
57
+ # Do something with that entry
58
+ end
59
+ ```
60
+
61
+ This approach can parse the entire JMDICT-E dictionary in a 15MB Ruby 2.6
62
+ process.
63
+
64
+ #### Return the results in an array
65
+
66
+ If you're just going to add all the entries to an array or otherwise retain them
67
+ in memory, you can call the same method without a block, and it will return all
68
+ the entries in an array.
69
+
70
+ ```ruby
71
+ entries = Eiwa.parse_file("path/to/some.xml", type: :jmdict_e)
72
+ ```
73
+
74
+ Note that for the abridged Japanese-English dictionary, this will consume about
75
+ 500MB of RAM.
76
+
77
+
@@ -0,0 +1,11 @@
1
+ require "bundler/gem_tasks"
2
+ require "rake/testtask"
3
+ require "standard/rake" # NOTE(review): presumably defines the "standard:fix" task used below - confirm
4
+
5
+ Rake::TestTask.new(:test) do |t| # defines the :test task
6
+ t.libs << "test"
7
+ t.libs << "lib"
8
+ t.test_files = FileList["test/**/*.rb"] # every Ruby file under test/ is treated as a test file
9
+ end
10
+
11
+ task default: [:test, "standard:fix"] # plain `rake` depends on :test, then "standard:fix"
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup" # NOTE(review): presumably puts the Gemfile's gems on the load path - confirm
4
+ require "eiwa"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__) # starts an IRB session with eiwa already required
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail # exit on errors, on unset variables and on a failing pipeline stage
3
+ IFS=$'\n\t' # split words on newlines and tabs only
4
+ set -vx # print each line as it is read and each command as it runs
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,28 @@
1
+ lib = File.expand_path("../lib", __FILE__) # absolute path of the lib/ directory next to this gemspec
2
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) # lets "eiwa/version" below be required
3
+ require "eiwa/version"
4
+
5
+ Gem::Specification.new do |spec|
6
+ spec.name = "eiwa"
7
+ spec.version = Eiwa::VERSION
8
+ spec.authors = ["Justin Searls"]
9
+ spec.email = ["searls@gmail.com"]
10
+
11
+ spec.summary = "Parses the JMDict Japanese-English dictionary"
12
+ spec.homepage = "https://github.com/searls/eiwa"
13
+ spec.license = "MIT"
14
+
15
+ spec.files = Dir.chdir(File.expand_path("..", __FILE__)) do # run git from the gemspec's own directory
16
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|tmp)/}) } # tracked files, minus anything under test/ or tmp/
17
+ end
18
+ spec.bindir = "exe"
19
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) } # basenames of packaged files under exe/
20
+ spec.require_paths = ["lib"]
21
+
22
+ spec.add_dependency "nokogiri" # the only runtime dependency; the rest are development-only
23
+ spec.add_development_dependency "bundler", "~> 1.17"
24
+ spec.add_development_dependency "rake", "~> 10.0"
25
+ spec.add_development_dependency "minitest", "~> 5.0"
26
+ spec.add_development_dependency "standard"
27
+ spec.add_development_dependency "pry"
28
+ end
@@ -0,0 +1,22 @@
1
+ require "eiwa/version"
2
+ require "eiwa/parses_jmdict_file"
3
+
4
+ # Top-level namespace and public entry point of the gem.
5
+ module Eiwa
6
+   # Error class raised by this library.
7
+   class Error < StandardError; end
8
+
9
+   # Parses the dictionary file at +filename+.
10
+   #
11
+   # A given block is handed to ParsesJmdictFile#call as the per-entry
12
+   # callback, and that call's return value is returned unchanged.
13
+   #
14
+   # Raises Eiwa::Error when +type+ is anything other than :jmdict_e.
15
+   def self.parse_file(filename, type: :jmdict_e, &each_entry_block)
16
+     unless type == :jmdict_e
17
+       raise Eiwa::Error.new("Unknown file type: #{type}")
18
+     end
19
+
20
+     ParsesJmdictFile.new.call(filename, each_entry_block)
21
+   end
22
+ end
@@ -0,0 +1,113 @@
1
+ require_relative "tag/entry"
2
+ require_relative "tag/spelling"
3
+ require_relative "tag/reading"
4
+ require_relative "tag/meaning"
5
+ require_relative "tag/entity"
6
+ require_relative "tag/cross_reference"
7
+ require_relative "tag/antonym"
8
+ require_relative "tag/source_language"
9
+ require_relative "tag/definition"
10
+ require_relative "tag/other"
11
+
12
+ require_relative "jmdict_entities"
13
+
14
+ module Eiwa
15
+   # Maps element names to the Tag class instantiated for them; any element
16
+   # not listed here is represented by Tag::Other.
17
+   TAGS = {
18
+     "entry" => Tag::Entry,
19
+     "k_ele" => Tag::Spelling,
20
+     "r_ele" => Tag::Reading,
21
+     "sense" => Tag::Meaning,
22
+     "pos" => Tag::Entity,
23
+     "misc" => Tag::Entity,
24
+     "dial" => Tag::Entity,
25
+     "field" => Tag::Entity,
26
+     "ke_inf" => Tag::Entity,
27
+     "re_inf" => Tag::Entity,
28
+     "xref" => Tag::CrossReference,
29
+     "ant" => Tag::Antonym,
30
+     "lsource" => Tag::SourceLanguage,
31
+     "gloss" => Tag::Definition,
32
+   }
33
+
34
+   # SAX document handler that turns the element stream into Tag objects and
35
+   # hands every completed Tag::Entry to an optional callback.
36
+   class JmdictDoc < Nokogiri::XML::SAX::Document
37
+     # each_entry_block - object responding to #call, or nil for no callback.
38
+     def initialize(each_entry_block)
39
+       @each_entry_block = each_entry_block
40
+     end
41
+
42
+     def start_document
43
+     end
44
+
45
+     def end_document
46
+     end
47
+
48
+     # Creates the tag object for +name+ and makes it current, with the
49
+     # previously current tag as its parent.
50
+     def start_element(name, attrs)
51
+       parent = @current
52
+       @current = (TAGS[name] || Tag::Other).new
53
+       @current.start(name, attrs, parent)
54
+     end
55
+
56
+     # Finishes the current tag, reports it if it is an entry, then hands it
57
+     # to its parent, which becomes current again.
58
+     #
59
+     # Raises Eiwa::Error if +name+ is not the tag that is currently open.
60
+     def end_element(name)
61
+       raise Eiwa::Error.new("Parsing error. Expected <#{@current.tag_name}> to close before <#{name}>") if @current.tag_name != name
62
+       ending = @current
63
+       ending.end_self
64
+       if ending.is_a?(Tag::Entry)
65
+         @each_entry_block&.call(ending)
66
+       end
67
+
68
+       @current = ending.parent
69
+       @current&.end_child(ending)
70
+     end
71
+
72
+     def characters(s)
73
+       @current.add_characters(s)
74
+     end
75
+
76
+     # def comment string
77
+     #   puts "comment #{string}"
78
+     # end
79
+
80
+     # def warning string
81
+     #   puts "warning #{string}"
82
+     # end
83
+
84
+     # Handles parser errors. An undefined-entity error is resolved through
85
+     # JMDICT_ENTITIES and recorded on the current tag; an entity reference
86
+     # loop is ignored; everything else raises Eiwa::Error.
87
+     def error(msg)
88
+       if (matches = msg.match(/Entity '([\S]+)' not defined/))
89
+         # See: http://github.com/sparklemotion/nokogiri/issues/1926
90
+         code = matches[1]
91
+         # Only Tag::Entity implements set_entity, so an undefined entity
92
+         # anywhere else is reported as a parse error rather than crashing
93
+         # with NoMethodError.
94
+         unless @current.respond_to?(:set_entity)
95
+           raise Eiwa::Error.new("Parsing error: #{msg}")
96
+         end
97
+         @current.set_entity(code, JMDICT_ENTITIES[code])
98
+       elsif msg == "Detected an entity reference loop\n"
99
+         # Do nothing and hope this does not matter.
100
+       else
101
+         raise Eiwa::Error.new("Parsing error: #{msg}")
102
+       end
103
+     end
104
+
105
+     # def cdata_block string
106
+     #   puts "cdata_block #{string}"
107
+     # end
108
+
109
+     # def processing_instruction name, content
110
+     #   puts "processing_instruction #{name}, #{content}"
111
+     # end
112
+   end
113
+ end
@@ -0,0 +1,177 @@
1
+ module Eiwa
2
+ JMDICT_ENTITIES = {
3
+ "MA" => "martial arts term",
4
+ "X" => "rude or X-rated term (not displayed in educational software)",
5
+ "abbr" => "abbreviation",
6
+ "adj-i" => "adjective (keiyoushi)",
7
+ "adj-ix" => "adjective (keiyoushi) - yoi/ii class",
8
+ "adj-na" => "adjectival nouns or quasi-adjectives (keiyodoshi)",
9
+ "adj-no" => "nouns which may take the genitive case particle `no'",
10
+ "adj-pn" => "pre-noun adjectival (rentaishi)",
11
+ "adj-t" => "`taru' adjective",
12
+ "adj-f" => "noun or verb acting prenominally",
13
+ "adv" => "adverb (fukushi)",
14
+ "adv-to" => "adverb taking the `to' particle",
15
+ "arch" => "archaism",
16
+ "ateji" => "ateji (phonetic) reading",
17
+ "aux" => "auxiliary",
18
+ "aux-v" => "auxiliary verb",
19
+ "aux-adj" => "auxiliary adjective",
20
+ "Buddh" => "Buddhist term",
21
+ "chem" => "chemistry term",
22
+ "chn" => "children's language",
23
+ "col" => "colloquialism",
24
+ "comp" => "computer terminology",
25
+ "conj" => "conjunction",
26
+ "cop-da" => "copula",
27
+ "ctr" => "counter",
28
+ "derog" => "derogatory",
29
+ "eK" => "exclusively kanji",
30
+ "ek" => "exclusively kana",
31
+ "exp" => "expressions (phrases, clauses, etc.)",
32
+ "fam" => "familiar language",
33
+ "fem" => "female term or language",
34
+ "food" => "food term",
35
+ "geom" => "geometry term",
36
+ "gikun" => "gikun (meaning as reading) or jukujikun (special kanji reading)",
37
+ "hon" => "honorific or respectful (sonkeigo) language",
38
+ "hum" => "humble (kenjougo) language",
39
+ "iK" => "word containing irregular kanji usage",
40
+ "id" => "idiomatic expression",
41
+ "ik" => "word containing irregular kana usage",
42
+ "int" => "interjection (kandoushi)",
43
+ "io" => "irregular okurigana usage",
44
+ "iv" => "irregular verb",
45
+ "ling" => "linguistics terminology",
46
+ "m-sl" => "manga slang",
47
+ "male" => "male term or language",
48
+ "male-sl" => "male slang",
49
+ "math" => "mathematics",
50
+ "mil" => "military",
51
+ "n" => "noun (common) (futsuumeishi)",
52
+ "n-adv" => "adverbial noun (fukushitekimeishi)",
53
+ "n-suf" => "noun, used as a suffix",
54
+ "n-pref" => "noun, used as a prefix",
55
+ "n-t" => "noun (temporal) (jisoumeishi)",
56
+ "num" => "numeric",
57
+ "oK" => "word containing out-dated kanji",
58
+ "obs" => "obsolete term",
59
+ "obsc" => "obscure term",
60
+ "ok" => "out-dated or obsolete kana usage",
61
+ "oik" => "old or irregular kana form",
62
+ "on-mim" => "onomatopoeic or mimetic word",
63
+ "pn" => "pronoun",
64
+ "poet" => "poetical term",
65
+ "pol" => "polite (teineigo) language",
66
+ "pref" => "prefix",
67
+ "proverb" => "proverb",
68
+ "prt" => "particle",
69
+ "physics" => "physics terminology",
70
+ "quote" => "quotation",
71
+ "rare" => "rare",
72
+ "sens" => "sensitive",
73
+ "sl" => "slang",
74
+ "suf" => "suffix",
75
+ "uK" => "word usually written using kanji alone",
76
+ "uk" => "word usually written using kana alone",
77
+ "unc" => "unclassified",
78
+ "yoji" => "yojijukugo",
79
+ "v1" => "Ichidan verb",
80
+ "v1-s" => "Ichidan verb - kureru special class",
81
+ "v2a-s" => "Nidan verb with 'u' ending (archaic)",
82
+ "v4h" => "Yodan verb with `hu/fu' ending (archaic)",
83
+ "v4r" => "Yodan verb with `ru' ending (archaic)",
84
+ "v5aru" => "Godan verb - -aru special class",
85
+ "v5b" => "Godan verb with `bu' ending",
86
+ "v5g" => "Godan verb with `gu' ending",
87
+ "v5k" => "Godan verb with `ku' ending",
88
+ "v5k-s" => "Godan verb - Iku/Yuku special class",
89
+ "v5m" => "Godan verb with `mu' ending",
90
+ "v5n" => "Godan verb with `nu' ending",
91
+ "v5r" => "Godan verb with `ru' ending",
92
+ "v5r-i" => "Godan verb with `ru' ending (irregular verb)",
93
+ "v5s" => "Godan verb with `su' ending",
94
+ "v5t" => "Godan verb with `tsu' ending",
95
+ "v5u" => "Godan verb with `u' ending",
96
+ "v5u-s" => "Godan verb with `u' ending (special class)",
97
+ "v5uru" => "Godan verb - Uru old class verb (old form of Eru)",
98
+ "vz" => "Ichidan verb - zuru verb (alternative form of -jiru verbs)",
99
+ "vi" => "intransitive verb",
100
+ "vk" => "Kuru verb - special class",
101
+ "vn" => "irregular nu verb",
102
+ "vr" => "irregular ru verb, plain form ends with -ri",
103
+ "vs" => "noun or participle which takes the aux. verb suru",
104
+ "vs-c" => "su verb - precursor to the modern suru",
105
+ "vs-s" => "suru verb - special class",
106
+ "vs-i" => "suru verb - included",
107
+ "kyb" => "Kyoto-ben",
108
+ "osb" => "Osaka-ben",
109
+ "ksb" => "Kansai-ben",
110
+ "ktb" => "Kantou-ben",
111
+ "tsb" => "Tosa-ben",
112
+ "thb" => "Touhoku-ben",
113
+ "tsug" => "Tsugaru-ben",
114
+ "kyu" => "Kyuushuu-ben",
115
+ "rkb" => "Ryuukyuu-ben",
116
+ "nab" => "Nagano-ben",
117
+ "hob" => "Hokkaido-ben",
118
+ "vt" => "transitive verb",
119
+ "vulg" => "vulgar expression or word",
120
+ "adj-kari" => "`kari' adjective (archaic)",
121
+ "adj-ku" => "`ku' adjective (archaic)",
122
+ "adj-shiku" => "`shiku' adjective (archaic)",
123
+ "adj-nari" => "archaic/formal form of na-adjective",
124
+ "n-pr" => "proper noun",
125
+ "v-unspec" => "verb unspecified",
126
+ "v4k" => "Yodan verb with `ku' ending (archaic)",
127
+ "v4g" => "Yodan verb with `gu' ending (archaic)",
128
+ "v4s" => "Yodan verb with `su' ending (archaic)",
129
+ "v4t" => "Yodan verb with `tsu' ending (archaic)",
130
+ "v4n" => "Yodan verb with `nu' ending (archaic)",
131
+ "v4b" => "Yodan verb with `bu' ending (archaic)",
132
+ "v4m" => "Yodan verb with `mu' ending (archaic)",
133
+ "v2k-k" => "Nidan verb (upper class) with `ku' ending (archaic)",
134
+ "v2g-k" => "Nidan verb (upper class) with `gu' ending (archaic)",
135
+ "v2t-k" => "Nidan verb (upper class) with `tsu' ending (archaic)",
136
+ "v2d-k" => "Nidan verb (upper class) with `dzu' ending (archaic)",
137
+ "v2h-k" => "Nidan verb (upper class) with `hu/fu' ending (archaic)",
138
+ "v2b-k" => "Nidan verb (upper class) with `bu' ending (archaic)",
139
+ "v2m-k" => "Nidan verb (upper class) with `mu' ending (archaic)",
140
+ "v2y-k" => "Nidan verb (upper class) with `yu' ending (archaic)",
141
+ "v2r-k" => "Nidan verb (upper class) with `ru' ending (archaic)",
142
+ "v2k-s" => "Nidan verb (lower class) with `ku' ending (archaic)",
143
+ "v2g-s" => "Nidan verb (lower class) with `gu' ending (archaic)",
144
+ "v2s-s" => "Nidan verb (lower class) with `su' ending (archaic)",
145
+ "v2z-s" => "Nidan verb (lower class) with `zu' ending (archaic)",
146
+ "v2t-s" => "Nidan verb (lower class) with `tsu' ending (archaic)",
147
+ "v2d-s" => "Nidan verb (lower class) with `dzu' ending (archaic)",
148
+ "v2n-s" => "Nidan verb (lower class) with `nu' ending (archaic)",
149
+ "v2h-s" => "Nidan verb (lower class) with `hu/fu' ending (archaic)",
150
+ "v2b-s" => "Nidan verb (lower class) with `bu' ending (archaic)",
151
+ "v2m-s" => "Nidan verb (lower class) with `mu' ending (archaic)",
152
+ "v2y-s" => "Nidan verb (lower class) with `yu' ending (archaic)",
153
+ "v2r-s" => "Nidan verb (lower class) with `ru' ending (archaic)",
154
+ "v2w-s" => "Nidan verb (lower class) with `u' ending and `we' conjugation (archaic)",
155
+ "archit" => "architecture term",
156
+ "astron" => "astronomy, etc. term",
157
+ "baseb" => "baseball term",
158
+ "biol" => "biology term",
159
+ "bot" => "botany term",
160
+ "bus" => "business term",
161
+ "econ" => "economics term",
162
+ "engr" => "engineering term",
163
+ "finc" => "finance term",
164
+ "geol" => "geology, etc. term",
165
+ "law" => "law, etc. term",
166
+ "mahj" => "mahjong term",
167
+ "med" => "medicine, etc. term",
168
+ "music" => "music term",
169
+ "Shinto" => "Shinto term",
170
+ "shogi" => "shogi term",
171
+ "sports" => "sports term",
172
+ "sumo" => "sumo term",
173
+ "zool" => "zoology term",
174
+ "joc" => "jocular, humorous term",
175
+ "anat" => "anatomical term",
176
+ }
177
+ end
@@ -0,0 +1,21 @@
1
+ require "nokogiri"
2
+ require_relative "jmdict_doc"
3
+
4
+ module Eiwa
5
+ class ParsesJmdictFile
6
+ def call(filename, each_entry_block)
7
+ if each_entry_block.nil?
8
+ entries = []
9
+ each_entry_block ||= ->(e) { entries << e }
10
+ end
11
+
12
+ JmdictDoc.new(each_entry_block).tap do |doc|
13
+ Nokogiri::XML::SAX::Parser.new(doc).parse_file(filename) do |ctx|
14
+ ctx.recovery = true
15
+ end
16
+ end
17
+
18
+ entries
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,28 @@
1
+ module Eiwa
2
+ module Tag
3
+ class Antonym < Any
4
+ attr_reader :text, :sense_ordinal
5
+
6
+ def initialize(text: nil, sense_ordinal: nil)
7
+ @text = text
8
+ @sense_ordinal = sense_ordinal
9
+ end
10
+
11
+ def end_self
12
+ parts = @characters.split("・")
13
+ @text = parts[0]
14
+ @sense_ordinal = parts[1]&.to_i
15
+ end
16
+
17
+ def eql?(other)
18
+ @text == other.text &&
19
+ @sense_ordinal == other.sense_ordinal
20
+ end
21
+ alias == eql?
22
+
23
+ def hash
24
+ @text.hash + @sense_ordinal.hash
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,28 @@
1
+ module Eiwa
2
+ module Tag
3
+ class Any
4
+ attr_reader :tag_name, :characters, :parent
5
+
6
+ def start(tag_name, attrs, parent)
7
+ @tag_name = tag_name
8
+ @attrs = Hash[attrs]
9
+ @parent = parent
10
+ end
11
+
12
+ def add_characters(s)
13
+ @characters ||= ""
14
+ @characters << s.chomp
15
+ end
16
+
17
+ def end_child(child)
18
+ end
19
+
20
+ def end_self
21
+ end
22
+
23
+ def to_s
24
+ "<#{@tag_name}>#{@characters}</#{@tag_name}>"
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,31 @@
1
+ module Eiwa
2
+ module Tag
3
+ class CrossReference < Any
4
+ attr_reader :text, :reading, :sense_ordinal
5
+
6
+ def initialize(text: nil, reading: nil, sense_ordinal: nil)
7
+ @text = text
8
+ @reading = reading
9
+ @sense_ordinal = sense_ordinal
10
+ end
11
+
12
+ def end_self
13
+ parts = @characters.split("・")
14
+ @text = parts.first
15
+ @reading = parts[1..-1].find { |part| /[^0-9]/.match(part) }
16
+ @sense_ordinal = parts.find { |part| /^[0-9]+$/.match(part) }&.to_i
17
+ end
18
+
19
+ def eql?(other)
20
+ @text == other.text &&
21
+ @reading == other.reading &&
22
+ @sense_ordinal == other.sense_ordinal
23
+ end
24
+ alias == eql?
25
+
26
+ def hash
27
+ @text.hash + @reading.hash + @sense_ordinal.hash
28
+ end
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,45 @@
1
+ module Eiwa
2
+ module Tag
3
+ class Definition < Any
4
+ attr_reader :text, :language, :gender, :type
5
+
6
+ def initialize(text: nil, language: "eng", gender: nil, type: nil)
7
+ @text = text
8
+ @language = language
9
+ @gender = gender
10
+ @type = type
11
+ end
12
+
13
+ def end_self
14
+ @text = @characters
15
+ @language = @attrs["xml:lang"]
16
+ @gender = @attrs["g_gend"]
17
+ @type = @attrs["g_type"]
18
+ end
19
+
20
+ def literal?
21
+ @type == "lit"
22
+ end
23
+
24
+ def figurative?
25
+ @type == "fig"
26
+ end
27
+
28
+ def explanation?
29
+ @type == "expl"
30
+ end
31
+
32
+ def eql?(other)
33
+ @text == other.text &&
34
+ @language == other.language &&
35
+ @gender == other.gender &&
36
+ @type == other.type
37
+ end
38
+ alias == eql?
39
+
40
+ def hash
41
+ @text.hash + @language.hash + @gender.hash + @type.hash
42
+ end
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,29 @@
1
+ require_relative "any"
2
+
3
+ module Eiwa
4
+ module Tag
5
+ class Entity < Any
6
+ attr_reader :code, :text
7
+
8
+ def initialize(code: nil, text: nil)
9
+ @code = code
10
+ @text = text
11
+ end
12
+
13
+ def set_entity(code, text)
14
+ @code = code
15
+ @text = text
16
+ end
17
+
18
+ def eql?(other)
19
+ @code == other.code &&
20
+ @text == other.text
21
+ end
22
+ alias == eql?
23
+
24
+ def hash
25
+ @code.hash + @text.hash
26
+ end
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,33 @@
1
+ require_relative "any"
2
+
3
+ module Eiwa
4
+ module Tag
5
+ class Entry < Any
6
+ attr_reader :id, :spellings, :readings, :meanings
7
+
8
+ def initialize
9
+ @spellings = []
10
+ @readings = []
11
+ @meanings = []
12
+ end
13
+
14
+ def text
15
+ (@spellings + @readings).first.text
16
+ end
17
+
18
+ def end_child(child)
19
+ case child.tag_name
20
+ when "ent_seq"
21
+ @id = child.characters.to_i
22
+ when "k_ele"
23
+ @spellings << child
24
+ when "r_ele"
25
+ @readings << child
26
+ when "sense"
27
+ child.trickle_down(@meanings.last) unless @meanings.last.nil?
28
+ @meanings << child
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,57 @@
1
+ require_relative "any"
2
+
3
+ module Eiwa
4
+ module Tag
5
+ class Meaning < Any
6
+ attr_reader :parts_of_speech, :definitions, :misc_tags,
7
+ :cross_references, :restricted_to_readings, :restricted_to_spellings,
8
+ :antonyms, :fields, :source_languages, :dialects, :comments
9
+
10
+ def initialize
11
+ @parts_of_speech = []
12
+ @definitions = []
13
+ @misc_tags = []
14
+ @cross_references = []
15
+ @restricted_to_readings = []
16
+ @restricted_to_spellings = []
17
+ @antonyms = []
18
+ @fields = []
19
+ @source_languages = []
20
+ @dialects = []
21
+ @comments = []
22
+ end
23
+
24
+ def end_child(child)
25
+ case child.tag_name
26
+ when "pos"
27
+ @parts_of_speech << child
28
+ when "gloss"
29
+ @definitions << child
30
+ when "misc"
31
+ @misc_tags << child
32
+ when "field"
33
+ @fields << child
34
+ when "xref"
35
+ @cross_references << child
36
+ when "ant"
37
+ @antonyms << child
38
+ when "stagr"
39
+ @restricted_to_readings << child.characters
40
+ when "stagk"
41
+ @restricted_to_spellings << child.characters
42
+ when "lsource"
43
+ @source_languages << child
44
+ when "dial"
45
+ @dialects << child
46
+ when "s_inf"
47
+ @comments << child.characters
48
+ end
49
+ end
50
+
51
+ def trickle_down(previous)
52
+ @parts_of_speech = previous.parts_of_speech if @parts_of_speech.empty?
53
+ @misc_tags = previous.misc_tags if @misc_tags.empty?
54
+ end
55
+ end
56
+ end
57
+ end
@@ -0,0 +1,9 @@
1
+ require_relative "any"
2
+
3
+ module Eiwa
4
+ module Tag
5
+ class Other < Any # fallback tag class for elements that have no dedicated Tag class
6
+ attr_reader :text # NOTE(review): nothing visible in this class assigns @text - confirm it is ever set
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,32 @@
1
+ require_relative "any"
2
+
3
+ module Eiwa
4
+ module Tag
5
+ class Reading < Any
6
+ attr_reader :text, :frequency_tags, :info_tags
7
+
8
+ def initialize
9
+ @frequency_tags = []
10
+ @info_tags = []
11
+ @imprecise_reading = false
12
+ end
13
+
14
+ def imprecise_reading?
15
+ @imprecise_reading
16
+ end
17
+
18
+ def end_child(child)
19
+ case child.tag_name
20
+ when "reb"
21
+ @text = child.characters
22
+ when "re_pri"
23
+ @frequency_tags << child.characters.to_sym
24
+ when "re_inf"
25
+ @info_tags << child
26
+ when "re_nokanji"
27
+ @imprecise_reading = true
28
+ end
29
+ end
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,33 @@
1
+ module Eiwa
2
+ module Tag
3
+ class SourceLanguage < Any
4
+ attr_reader :text, :language, :wasei, :type
5
+
6
+ def initialize(text: nil, language: "eng", wasei: false, type: "full")
7
+ @text = text
8
+ @language = language
9
+ @wasei = wasei
10
+ @type = type
11
+ end
12
+
13
+ def end_self
14
+ @text = @characters
15
+ @language = @attrs["xml:lang"]
16
+ @wasei = @attrs["ls_wasei"] == "y"
17
+ @type = @attrs["ls_type"] || "full"
18
+ end
19
+
20
+ def eql?(other)
21
+ @text == other.text &&
22
+ @language == other.language &&
23
+ @wasei == other.wasei &&
24
+ @type == other.type
25
+ end
26
+ alias == eql?
27
+
28
+ def hash
29
+ @text.hash + @language.hash + @wasei.hash + @type.hash
30
+ end
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,25 @@
1
+ require_relative "any"
2
+
3
+ module Eiwa
4
+ module Tag
5
+ class Spelling < Any
6
+ attr_reader :text, :frequency_tags, :info_tags
7
+
8
+ def initialize
9
+ @frequency_tags = []
10
+ @info_tags = []
11
+ end
12
+
13
+ def end_child(child)
14
+ case child.tag_name
15
+ when "keb"
16
+ @text = child.characters
17
+ when "ke_pri"
18
+ @frequency_tags << child.characters.to_sym
19
+ when "ke_inf"
20
+ @info_tags << child
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,3 @@
1
+ module Eiwa
2
+ VERSION = "0.0.1" # gem version; read by the gemspec as Eiwa::VERSION
3
+ end
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env bash
2
+ # Downloads the Japanese-English JMdict export to tmp/jmdict.xml.
3
+
4
+ set -e
5
+
6
+ mkdir -p tmp/
7
+ # --fail makes curl exit non-zero on an HTTP error, so `set -e` stops the
8
+ # script instead of leaving the server's error page in tmp/jmdict.xml.
9
+ curl --fail http://ftp.monash.edu/pub/nihongo/JMdict_e -o tmp/jmdict.xml
@@ -0,0 +1,12 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ lib = File.expand_path("../../lib", __FILE__) # lib/ directory, resolved relative to this script
4
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) # so `require "eiwa"` can load from that directory
5
+ require "eiwa"
6
+
7
+ count = 0
8
+ Eiwa.parse_file("tmp/jmdict.xml", type: :jmdict_e) do |entry| # block form: entries are only counted, not kept
9
+ count += 1
10
+ end
11
+
12
+ puts "Cool. #{count} words"
metadata ADDED
@@ -0,0 +1,155 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: eiwa
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Justin Searls
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2019-09-28 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.17'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.17'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '10.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '5.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '5.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: standard
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: pry
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description:
98
+ email:
99
+ - searls@gmail.com
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - ".github/workflows/ruby.yml"
105
+ - ".gitignore"
106
+ - Gemfile
107
+ - Gemfile.lock
108
+ - LICENSE.txt
109
+ - README.md
110
+ - Rakefile
111
+ - bin/console
112
+ - bin/setup
113
+ - eiwa.gemspec
114
+ - lib/eiwa.rb
115
+ - lib/eiwa/jmdict_doc.rb
116
+ - lib/eiwa/jmdict_entities.rb
117
+ - lib/eiwa/parses_jmdict_file.rb
118
+ - lib/eiwa/tag/antonym.rb
119
+ - lib/eiwa/tag/any.rb
120
+ - lib/eiwa/tag/cross_reference.rb
121
+ - lib/eiwa/tag/definition.rb
122
+ - lib/eiwa/tag/entity.rb
123
+ - lib/eiwa/tag/entry.rb
124
+ - lib/eiwa/tag/meaning.rb
125
+ - lib/eiwa/tag/other.rb
126
+ - lib/eiwa/tag/reading.rb
127
+ - lib/eiwa/tag/source_language.rb
128
+ - lib/eiwa/tag/spelling.rb
129
+ - lib/eiwa/version.rb
130
+ - script/download_jmdict
131
+ - script/parse_jmdict
132
+ homepage: https://github.com/searls/eiwa
133
+ licenses:
134
+ - MIT
135
+ metadata: {}
136
+ post_install_message:
137
+ rdoc_options: []
138
+ require_paths:
139
+ - lib
140
+ required_ruby_version: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - ">="
143
+ - !ruby/object:Gem::Version
144
+ version: '0'
145
+ required_rubygems_version: !ruby/object:Gem::Requirement
146
+ requirements:
147
+ - - ">="
148
+ - !ruby/object:Gem::Version
149
+ version: '0'
150
+ requirements: []
151
+ rubygems_version: 3.0.3
152
+ signing_key:
153
+ specification_version: 4
154
+ summary: Parses the JMDict Japanese-English dictionary
155
+ test_files: []