imedic-tools 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 7670f860570f73ef0fdf31fe278598246eab359159af59e8ebc3c1e5bc35aaef
4
+ data.tar.gz: c472bf447d23344b7f34403f67bbc7408cc1f259662adcf0b2e29803fff382fa
5
+ SHA512:
6
+ metadata.gz: 3ba08c54418c73016822bb12f24d4d4f47791dcc0d71c52bb0f431828f3b61ef09db7b92972971526a3218f49dba2dfed5f245c9db20e32885a2fbb42c440a14
7
+ data.tar.gz: 57354a763840c018d3c1fec1bf7ec0ea4c4b798cecdde30e2a8e14d3d7033f422b2e9daf09e728e7887752f812add47891b7c8b7f9daaf5d5bf462fcaab23045
@@ -0,0 +1,10 @@
---
# Dependabot configuration: weekly update checks for the GitHub Actions
# referenced from the repository root.
version: 2
updates:
  - package-ecosystem: github-actions
    directory: /
    schedule:
      interval: weekly
    groups:
      # A single group whose pattern matches every action name.
      github-actions:
        patterns:
          - "*"
@@ -0,0 +1,52 @@
# Publishes the gem and creates a GitHub release whenever a tag matching
# "v*" is pushed.
name: Publish gem to rubygems.org

on:
  push:
    tags:
      - "v*"

# Workflow-wide default; the job below widens this for itself.
permissions:
  contents: read

jobs:
  push:
    # Only run in the canonical repository, not in forks.
    if: github.repository == 'knu/imedic-tools'
    runs-on: ubuntu-latest

    environment:
      name: rubygems.org
      url: https://rubygems.org/gems/imedic-tools

    permissions:
      # Needed by the "Create GitHub release" step below.
      contents: write
      # NOTE(review): presumably used for OIDC-based publishing by
      # rubygems/release-gem; confirm against that action's documentation.
      id-token: write

    steps:
      - name: Harden Runner
        uses: step-security/harden-runner@ab7a9404c0f3da075243ca237b5fac12c98deaa5 # v2.19.3
        with:
          egress-policy: audit

      - name: Check out repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

      - name: Set up Ruby
        uses: ruby/setup-ruby@8aeb6ff8030dd539317f8e1769a044873b56ea71 # v1.268.0
        with:
          ruby-version: ruby

      - name: Install dependencies
        run: bundle install --jobs 4 --retry 3

      - name: Publish to RubyGems
        uses: rubygems/release-gem@6317d8d1f7e28c24d28f6eff169ea854948bd9f7 # v1.2.0

      - name: Create GitHub release
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail
          # This workflow only runs for tag pushes, so GITHUB_REF_NAME is the
          # tag that triggered it. Using it directly avoids re-deriving the tag
          # from whatever history and tags happen to be present in the checkout.
          gh release create "${GITHUB_REF_NAME}" --verify-tag --generate-notes
data/.gitignore ADDED
@@ -0,0 +1,7 @@
1
+ /.bundle/
2
+ /.yardoc/
3
+ /coverage/
4
+ /doc/
5
+ /imedic-tools-*.gem
6
+ /pkg/
7
+ /tmp/
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
4
+ gem "rake", "~> 13.0"
data/LICENSE ADDED
@@ -0,0 +1,24 @@
1
+ Copyright (c) 2026 Akinori Musha
2
+
3
+ Redistribution and use in source and binary forms, with or without
4
+ modification, are permitted provided that the following conditions
5
+ are met:
6
+
7
+ 1. Redistributions of source code must retain the above copyright
8
+ notice, this list of conditions and the following disclaimer.
9
+
10
+ 2. Redistributions in binary form must reproduce the above copyright
11
+ notice, this list of conditions and the following disclaimer in the
12
+ documentation and/or other materials provided with the distribution.
13
+
14
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
18
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
20
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/README.md ADDED
@@ -0,0 +1,55 @@
1
+ # imedic-tools
2
+
3
+ This gem provides command-line tools for converting Japanese input dictionary word-list files between common formats.
4
+
5
+ ## Features
6
+
7
+ - **ATOK support**: Reads and writes ATOK word-list text files
8
+ - **MS-IME support**: Reads and writes modern Microsoft IME word-list files with a Unicode header, UTF-16LE BOM, and CRLF line endings
9
+ - **Kotoeri output**: Generates UTF-8 CSV files for Kotoeri-compatible import workflows
10
+ - **Comment preservation**: Converts comments between ATOK and MS-IME comment syntax
11
+ - **Multiple input files**: Accepts one or more input files and writes a combined dictionary to standard output
12
+
13
+ ## Installation
14
+
15
+ Install the gem from RubyGems:
16
+
17
+ ```sh
18
+ gem install imedic-tools
19
+ ```
20
+
21
+ The scripts under `exe/` can also be used standalone without installing the gem.
22
+
23
+ ## Usage
24
+
25
+ This gem includes three tools:
26
+
27
+ - `atok2msime`: Converts ATOK word-list files to Microsoft IME format
28
+ - `msime2atok`: Converts Microsoft IME word-list files to ATOK format
29
+ - `atok2kotoeri`: Converts ATOK word-list files to Kotoeri CSV format
30
+
31
+ All three tools are invoked the same way:
32
+
33
+ ```sh
34
+ atok2msime < input.atok.txt > output.msime.txt
35
+ atok2msime input1.atok.txt input2.atok.txt > output.msime.txt
36
+ ```
37
+
38
+ Each tool accepts one or more input files, or standard input when no file is given, and writes the converted word list to standard output.
39
+
40
+ ## ATOK Input Format
41
+
42
+ Input files are expected to be ATOK word-list text files:
43
+
44
+ ```text
45
+ !!ATOK_TANGO_TEXT_HEADER_1
46
+ !! Optional comment
47
+
48
+ よみ 単語 名詞*
49
+ ```
50
+
51
+ The tools read UTF-8 files without a BOM, as well as UTF-8 and UTF-16 files that start with a BOM.
52
+
53
+ ## License
54
+
55
+ This project is distributed under the 2-clause BSD license.
data/Rakefile ADDED
@@ -0,0 +1,3 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
data/exe/atok2kotoeri ADDED
@@ -0,0 +1,93 @@
1
+ #!/usr/bin/env ruby
2
+
# Maps an ATOK part-of-speech (hinshi) name to the name written to the
# Kotoeri CSV output. Lookups for names missing from this table are handled
# by the caller. Frozen because it is a read-only lookup table.
HINSHI_ATOK2KOTOERI = {
  "カ行五段" => "無品詞",
  "ガ行五段" => "無品詞",
  "サ変動詞" => "無品詞",
  "サ行五段" => "無品詞",
  "ザ変動詞" => "無品詞",
  "タ行五段" => "無品詞",
  "ナ行五段" => "無品詞",
  "バ行五段" => "無品詞",
  "マ行五段" => "無品詞",
  "ラ行五段" => "無品詞",
  "ワ行五段" => "無品詞",
  "一段動詞" => "無品詞",
  "冠数詞" => "普通名詞",
  "副詞" => "無品詞",
  "助数詞" => "数字列接尾語",
  "単漢字" => "無品詞",
  "名詞" => "普通名詞",
  "名詞サ変" => "サ変名詞",
  "名詞ザ変" => "サ変名詞",
  "名詞形動" => "形容動詞",
  "固有一般" => "普通名詞",
  "固有人名" => "人名",
  "固有人姓" => "人名",
  "固有商品" => "普通名詞",
  "固有地名" => "地名",
  "固有組織" => "普通名詞",
  "形動タリ" => "形容動詞",
  "形容動詞" => "形容動詞",
  "形容詞" => "形容詞",
  "感動詞" => "無品詞",
  "接尾語" => "無品詞",
  "接続詞" => "無品詞",
  "接頭語" => "無品詞",
  "数詞" => "普通名詞",
  "独立語" => "無品詞",
  "短縮読み" => "無品詞",
  "連体詞" => "無品詞",
  "顔文字" => "無品詞",
}.freeze
43
+
# Reads an ATOK word list and yields one entry per usable line.
#
# file - path of the input file, or "-" for standard input.
#
# Lines that start with "!!" and lines that are empty or whitespace-only are
# skipped, as are lines with fewer than three tab-separated fields.
# Yields the first field, the second field, and the third field with any
# trailing "*" / "$" characters removed.
def each_atok_entry(file)
  open_input(file) do |input|
    input.each_line(chomp: true) do |record|
      case record
      when /^(!!|\s*$)/
        next
      end

      reading, word, pos = record.split(/\t+/)
      next unless pos

      yield reading, word, pos.sub(/[*$]+\z/, "")
    end
  end
end
56
+
# Yields an input stream prepared by setup_input.
#
# file - "-" selects STDIN (which is left open); any other value is opened
#        as a file in binary read mode and closed when the block returns.
#
# Returns the value of the block.
def open_input(file)
  if file == "-"
    setup_input(STDIN)
    return yield(STDIN)
  end

  File.open(file, "rb") do |stream|
    setup_input(stream)
    yield stream
  end
end
68
+
# Configures io so that reads produce UTF-8 text.
#
# The stream is switched to binary mode and checked for a byte order mark.
# When a BOM is found, its encoding becomes the external encoding and data is
# transcoded to UTF-8; otherwise the stream is read as UTF-8.
def setup_input(io)
  io.binmode
  detected = io.set_encoding_by_bom
  if detected.nil?
    io.set_encoding(Encoding::UTF_8)
  else
    io.set_encoding(detected, Encoding::UTF_8)
  end
end
77
+
# Wraps word in double quotes for CSV output, doubling any embedded double
# quote characters.
#
# Builds a new string by interpolation instead of appending to a string
# literal, so it keeps working when string literals are frozen.
def quote(word)
  %("#{word.gsub('"', '""')}")
end
81
+
# Script entry point: convert every input to Kotoeri CSV on standard output.
$stdout.set_encoding(Encoding::UTF_8)

# With no arguments, read from standard input ("-").
files = ARGV.empty? ? ["-"] : ARGV

files.each do |file|
  each_atok_entry(file) do |yomi, tango, hinshi|
    # Part-of-speech names missing from the table fall back to "無品詞".
    newhinshi = HINSHI_ATOK2KOTOERI.fetch(hinshi, "無品詞")
    puts [yomi, tango, newhinshi].map { |word| quote(word) }.join(",")
  rescue EncodingError => e
    # The exception is passed as a format argument, not interpolated into the
    # format string, so a "%" in its message cannot break the warning.
    warn "skipped: %s %s [%s]: %s" % [yomi, tango, hinshi, e]
  end
end
data/exe/atok2msime ADDED
@@ -0,0 +1,103 @@
1
+ #!/usr/bin/env ruby
2
+
# Maps an ATOK part-of-speech (hinshi) name to the corresponding name used in
# Microsoft IME word lists. Frozen because it is a read-only lookup table.
HINSHI_ATOK2MSIME = {
  'カ行五段' => 'か行五段',
  'ガ行五段' => 'が行五段',
  'サ変動詞' => 'さ変動詞',
  'サ行五段' => 'さ行五段',
  'ザ変動詞' => 'ざ変動詞',
  'タ行五段' => 'た行五段',
  'ナ行五段' => 'な行五段',
  'バ行五段' => 'ば行五段',
  'マ行五段' => 'ま行五段',
  'ラ行五段' => 'ら行五段',
  'ワ行五段' => 'あわ行五段',
  '一段動詞' => '一段動詞',
  '冠数詞' => '冠数詞',
  '副詞' => '副詞',
  '助数詞' => '助数詞',
  '単漢字' => '単漢字',
  '名詞' => '名詞',
  '名詞形動' => '形容動詞',
  '名詞サ変' => 'さ変名詞',
  '名詞ザ変' => 'ざ変名詞',
  '固有一般' => '固有名詞',
  '固有人名' => '名',
  '固有人姓' => '姓',
  '固有商品' => '固有名詞',
  '固有地名' => '地名その他',
  '固有組織' => '社名',
  '形動タリ' => '形容動詞タル',
  '形容動詞' => '形容動詞',
  '形容詞' => '形容詞',
  '感動詞' => '感動詞',
  '接尾語' => '接尾語',
  '接続詞' => '接続詞',
  '接頭語' => '接頭語',
  '数詞' => '数量',
  '独立語' => '独立語',
  '短縮読み' => '短縮よみ',
  '連体詞' => '連体詞',
  '顔文字' => '顔文字',
}.freeze
43
+
# Converts one ATOK word list to Microsoft IME word-list lines on standard
# output.
#
# file - path of the input file, or "-" for standard input.
#
# Blank lines and the "!!ATOK_TANGO_TEXT_HEADER_1" header are dropped. Other
# lines starting with "!!" are written as "!" comments; the text after "!!"
# is kept only when it begins with whitespace. Entries with fewer than three
# tab-separated fields are ignored. An entry that raises (for example an
# unknown part of speech) is reported on standard error and skipped.
def convert_atok_file(file)
  open_input(file) do |io|
    io.each_line(chomp: true) do |line|
      case line
      when /\A\s*\z|\A!!ATOK_TANGO_TEXT_HEADER_1\z/
        next
      when /\A!!(\s.*\z)?/
        puts "!#{$1}"
        next
      end

      yomi, tango, hinshi = line.split(/\t+/)
      next if hinshi.nil?

      # Strip trailing "*" / "$" markers before the table lookup.
      hinshi = hinshi.sub(/[*$]+\z/, "")
      newhinshi = HINSHI_ATOK2MSIME[hinshi] or
        raise "unsupported hinshi"

      puts [yomi, tango, newhinshi].join("\t")
    rescue => e
      # The exception is passed as a format argument, not interpolated into
      # the format string, so a "%" in its message cannot break the warning.
      warn "skipped: %s %s [%s]: %s" % [yomi, tango, hinshi, e]
    end
  end
end
68
+
# Yields an input stream prepared by setup_input.
#
# file - "-" selects STDIN (which is left open); any other value is opened
#        as a file in binary read mode and closed when the block returns.
#
# Returns the value of the block.
def open_input(file)
  if file == "-"
    setup_input(STDIN)
    return yield(STDIN)
  end

  File.open(file, "rb") do |stream|
    setup_input(stream)
    yield stream
  end
end
80
+
# Configures io so that reads produce UTF-8 text.
#
# The stream is switched to binary mode and checked for a byte order mark.
# When a BOM is found, its encoding becomes the external encoding and data is
# transcoded to UTF-8; otherwise the stream is read as UTF-8.
def setup_input(io)
  io.binmode
  detected = io.set_encoding_by_bom
  if detected.nil?
    io.set_encoding(Encoding::UTF_8)
  else
    io.set_encoding(detected, Encoding::UTF_8)
  end
end
89
+
# Script entry point: emit a Microsoft IME word list on standard output.
# Output is transcoded from UTF-8 to UTF-16LE with CRLF line endings and
# starts with a byte order mark.
$stdout.set_encoding(Encoding::UTF_16LE, Encoding::UTF_8, crlf_newline: true)
$stdout.write("\uFEFF")

# Header lines, stamped with the current date, followed by an empty line.
today = Time.now.strftime("%Y年%m月%d日")
header = [
  "!Microsoft IME Dictionary Tool",
  "!Version:",
  "!Format:WORDLIST",
  "!DateTime: #{today}",
  "",
]
puts header

# With no arguments, read from standard input ("-").
inputs = ARGV.empty? ? ["-"] : ARGV

inputs.each { |path| convert_atok_file(path) }
data/exe/msime2atok ADDED
@@ -0,0 +1,101 @@
1
+ #!/usr/bin/env ruby
2
+
# Maps a Microsoft IME part-of-speech (hinshi) name to the corresponding ATOK
# name. Both "あわ行五段" and "わ行五段" map to "ワ行五段". Frozen because it
# is a read-only lookup table.
HINSHI_MSIME2ATOK = {
  "か行五段" => "カ行五段",
  "が行五段" => "ガ行五段",
  "さ変動詞" => "サ変動詞",
  "さ行五段" => "サ行五段",
  "ざ変動詞" => "ザ変動詞",
  "た行五段" => "タ行五段",
  "な行五段" => "ナ行五段",
  "ば行五段" => "バ行五段",
  "ま行五段" => "マ行五段",
  "ら行五段" => "ラ行五段",
  "あわ行五段" => "ワ行五段",
  "わ行五段" => "ワ行五段",
  "一段動詞" => "一段動詞",
  "冠数詞" => "冠数詞",
  "副詞" => "副詞",
  "助数詞" => "助数詞",
  "単漢字" => "単漢字",
  "名詞" => "名詞",
  "形容動詞" => "形容動詞",
  "さ変名詞" => "名詞サ変",
  "ざ変名詞" => "名詞ザ変",
  "固有名詞" => "固有一般",
  "名" => "固有人名",
  "姓" => "固有人姓",
  "地名その他" => "固有地名",
  "社名" => "固有組織",
  "形容動詞タル" => "形動タリ",
  "形容詞" => "形容詞",
  "感動詞" => "感動詞",
  "接尾語" => "接尾語",
  "接続詞" => "接続詞",
  "接頭語" => "接頭語",
  "数量" => "数詞",
  "独立語" => "独立語",
  "短縮よみ" => "短縮読み",
  "連体詞" => "連体詞",
  "顔文字" => "顔文字",
}.freeze
42
+
# Passes every line of the given input, without its line terminator, to the
# supplied block.
#
# file - path of the input file, or "-" for standard input.
def each_msime_line(file, &block)
  open_input(file) { |input| input.each_line(chomp: true, &block) }
end
48
+
# Yields an input stream prepared by setup_input.
#
# file - "-" selects STDIN (which is left open); any other value is opened
#        as a file in binary read mode and closed when the block returns.
#
# Returns the value of the block.
def open_input(file)
  if file == "-"
    setup_input(STDIN)
    return yield(STDIN)
  end

  File.open(file, "rb") do |stream|
    setup_input(stream)
    yield stream
  end
end
60
+
# Configures io so that reads produce UTF-8 text.
#
# The stream is switched to binary mode and checked for a byte order mark.
# When a BOM is found, its encoding becomes the external encoding and data is
# transcoded to UTF-8; otherwise the stream is read as UTF-8.
def setup_input(io)
  io.binmode
  detected = io.set_encoding_by_bom
  if detected.nil?
    io.set_encoding(Encoding::UTF_8)
  else
    io.set_encoding(detected, Encoding::UTF_8)
  end
end
69
+
# Converts one Microsoft IME word list to ATOK word-list lines on standard
# output.
#
# file - path of the input file, or "-" for standard input.
#
# Blank lines and lines where "!" is followed directly by a non-whitespace
# character are dropped. Remaining "!" lines are written as "!!" comments.
# Entries with fewer than three tab-separated fields are ignored. Each
# converted entry gets "*" appended to its part of speech. An entry that
# raises (for example an unknown part of speech) is reported on standard
# error and skipped.
def convert_msime_file(file)
  each_msime_line(file) do |line|
    case line
    when /\A\s*\z|\A!\S/
      next
    when /\A!(\s.*\z)?/
      puts "!!#{$1}"
      next
    end

    yomi, tango, hinshi = line.split(/\t+/)
    next if hinshi.nil?

    # Strip trailing "*" / "$" markers before the table lookup.
    hinshi = hinshi.sub(/[*$]+\z/, "")
    newhinshi = HINSHI_MSIME2ATOK[hinshi] or
      raise "unsupported hinshi"

    puts [yomi, tango, "#{newhinshi}*"].join("\t")
  rescue => e
    # The exception is passed as a format argument, not interpolated into the
    # format string, so a "%" in its message cannot break the warning.
    warn "skipped: %s %s [%s]: %s" % [yomi, tango, hinshi, e]
  end
end
92
+
# Script entry point: emit an ATOK word list on standard output, requesting
# UTF-8 with CRLF newline conversion.
$stdout.set_encoding(Encoding::UTF_8, Encoding::UTF_8, crlf_newline: true)

# The ATOK header line comes first.
puts "!!ATOK_TANGO_TEXT_HEADER_1"

# With no arguments, read from standard input ("-").
inputs = ARGV.empty? ? ["-"] : ARGV

inputs.each { |path| convert_msime_file(path) }
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/imedic/tools/version"
4
+
# Gem specification for imedic-tools.
Gem::Specification.new do |spec|
  spec.name = "imedic-tools"
  spec.version = Imedic::Tools::VERSION
  spec.authors = ["Akinori Musha"]
  spec.email = ["knu@idaemons.org"]

  spec.summary = "Japanese input dictionary word-list conversion tools"
  spec.description = "Command-line tools that convert Japanese input dictionary word-list files between common formats."
  spec.homepage = "https://github.com/knu/imedic-tools"
  spec.license = "BSD-2-Clause"
  spec.required_ruby_version = ">= 3.1"

  spec.metadata = {
    "bug_tracker_uri" => "https://github.com/knu/imedic-tools/issues",
    "changelog_uri" => "https://github.com/knu/imedic-tools/releases",
    "homepage_uri" => spec.homepage,
  }

  # Package the files tracked by git, minus repository-only files
  # (.github/, .gitignore, Gemfile) that are not needed at runtime.
  # IO.popen with an argument array runs git without a shell, and stderr is
  # discarded so a missing repository does not print noise.
  spec.files = IO.popen(%w[git ls-files -z], chdir: __dir__, err: IO::NULL) do |ls|
    ls.readlines("\x0", chomp: true).reject do |path|
      path.start_with?(".git", "Gemfile")
    end
  end
  spec.bindir = "exe"
  spec.executables = ["atok2kotoeri", "atok2msime", "msime2atok"]
  spec.require_paths = ["lib"]
end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
# Top-level namespace of the imedic-tools gem.
module Imedic
  # Namespace holding the gem's version constant.
  module Tools
    # Version string of this release.
    VERSION = "0.1.0"
  end
end
@@ -0,0 +1,3 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "tools/version"
@@ -0,0 +1,3 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "imedic/tools"
metadata ADDED
@@ -0,0 +1,61 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: imedic-tools
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Akinori Musha
8
+ bindir: exe
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies: []
12
+ description: Command-line tools that convert Japanese input dictionary word-list files
13
+ between common formats.
14
+ email:
15
+ - knu@idaemons.org
16
+ executables:
17
+ - atok2kotoeri
18
+ - atok2msime
19
+ - msime2atok
20
+ extensions: []
21
+ extra_rdoc_files: []
22
+ files:
23
+ - ".github/dependabot.yml"
24
+ - ".github/workflows/push_gem.yml"
25
+ - ".gitignore"
26
+ - Gemfile
27
+ - LICENSE
28
+ - README.md
29
+ - Rakefile
30
+ - exe/atok2kotoeri
31
+ - exe/atok2msime
32
+ - exe/msime2atok
33
+ - imedic-tools.gemspec
34
+ - lib/imedic-tools.rb
35
+ - lib/imedic/tools.rb
36
+ - lib/imedic/tools/version.rb
37
+ homepage: https://github.com/knu/imedic-tools
38
+ licenses:
39
+ - BSD-2-Clause
40
+ metadata:
41
+ bug_tracker_uri: https://github.com/knu/imedic-tools/issues
42
+ changelog_uri: https://github.com/knu/imedic-tools/releases
43
+ homepage_uri: https://github.com/knu/imedic-tools
44
+ rdoc_options: []
45
+ require_paths:
46
+ - lib
47
+ required_ruby_version: !ruby/object:Gem::Requirement
48
+ requirements:
49
+ - - ">="
50
+ - !ruby/object:Gem::Version
51
+ version: '3.1'
52
+ required_rubygems_version: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: '0'
57
+ requirements: []
58
+ rubygems_version: 3.6.9
59
+ specification_version: 4
60
+ summary: Japanese input dictionary word-list conversion tools
61
+ test_files: []