RubyGems - faker-okinawa - Versions diffs - 0.1.0 - Mend

faker-okinawa 0.1.0

Files changed (65) hide show

checksums.yaml +7 -0
data/.gitignore +9 -0
data/CODE_OF_CONDUCT.md +49 -0
data/Gemfile +4 -0
data/LICENSE.txt +21 -0
data/README.md +66 -0
data/Rakefile +2 -0
data/bin/console +14 -0
data/bin/setup +8 -0
data/faker-okinawa.gemspec +23 -0
data/lib/faker/okinawa.rb +14 -0
data/lib/faker/okinawa/address.rb +35 -0
data/lib/faker/okinawa/awamori.rb +14 -0
data/lib/faker/okinawa/base.rb +14 -0
data/lib/faker/okinawa/fish.rb +14 -0
data/lib/faker/okinawa/food.rb +14 -0
data/lib/faker/okinawa/name.rb +14 -0
data/lib/faker/okinawa/odic.rb +40 -0
data/lib/faker/okinawa/school.rb +14 -0
data/lib/faker/okinawa/version.rb +5 -0
data/o-dic/address.dic +1068 -0
data/o-dic/amami.dic +69 -0
data/o-dic/awamori.dic +809 -0
data/o-dic/base.dic +63 -0
data/o-dic/bin-dic/ATOK.zip +0 -0
data/o-dic/bin-dic/Kotoeri.dmg +0 -0
data/o-dic/bin-dic/MSIME.zip +0 -0
data/o-dic/bin-dic/MSIME2000.zip +0 -0
data/o-dic/bin-dic/MSIME2002.zip +0 -0
data/o-dic/bin-dic/MSIME2003.zip +0 -0
data/o-dic/bin-dic/MSIME2007.zip +0 -0
data/o-dic/bin-dic/VJEDelta40.zip +0 -0
data/o-dic/bin-dic/anthydic20090901.tar.bz2 +0 -0
data/o-dic/bin-dic/cannadic.tar.bz2 +0 -0
data/o-dic/bin-dic/cannadic20030407.tar.bz2 +0 -0
data/o-dic/bin-dic/cannadic20051104.tar.bz2 +0 -0
data/o-dic/bin-dic/google-20111218.zip +0 -0
data/o-dic/bus.dic +216 -0
data/o-dic/city.dic +209 -0
data/o-dic/doc/History.doc +138 -0
data/o-dic/doc/README.1ST +79 -0
data/o-dic/doc/format.txt +169 -0
data/o-dic/food.dic +524 -0
data/o-dic/geo.dic +815 -0
data/o-dic/history.dic +1078 -0
data/o-dic/island.dic +354 -0
data/o-dic/misc.dic +1166 -0
data/o-dic/name.dic +1039 -0
data/o-dic/park.dic +535 -0
data/o-dic/sakana.dic +160 -0
data/o-dic/school.dic +1068 -0
data/o-dic/script/ODIC.pm +98 -0
data/o-dic/script/ccount.pl +111 -0
data/o-dic/script/geta_checker.sh +29 -0
data/o-dic/script/oki2atk.pl +192 -0
data/o-dic/script/oki2canna.pl +131 -0
data/o-dic/script/oki2cha.pl +154 -0
data/o-dic/script/oki2jis.pl +167 -0
data/o-dic/script/oki2kotoeri.pl +127 -0
data/o-dic/script/oki2mozc.pl +156 -0
data/o-dic/script/oki2msime.pl +136 -0
data/o-dic/script/oki2osxjapaneseim.pl +168 -0
data/o-dic/script/oki2vje.pl +138 -0
data/o-dic/script/wcount.pl +114 -0
metadata +135 -0

data/o-dic/script/oki2mozc.pl ADDED Viewed

@@ -0,0 +1,156 @@
+#!/usr/bin/perl
+#
+# 沖縄辞書のファイルをMozc/Google日本語入力のユーザー辞書へ変換するスクリプト
+#							山城潤
+#
+# 使用例:
+#    $ cat ../*.dic | ./oki2mozc.pl | sort -u > mozc_okidic.txt
+#
+#    --mozc-source / -m オプションにMozcのソースツリーが指定された場合には、
+#    沖縄辞書からMozcの辞書に取り込まれている単語を除外する。ただし、
+#    品詞のチェックは行っていない。
+#    $ cat ../*.dic | ./oki2mozc.pl -m /path/to/mozc-w.x.y.z | sort -u > mozc_okidic.txt
+#
+# 品詞一覧
+# mozc-w.x.y.z/dictionary/user_dictionary_storage.proto
+# mozc-w.x.y.z/dictionary/user_dictionary_util.cc
+# mozc-w.x.y.z/data/rules/user_pos.def
+# 他の日本語変換システムとの品詞マッピング
+# mozc-w.x.y.z/data/rules/third_party_pos_map.def
+# サンプル
+# mozc-w.x.y.z/third_party/japanese_usage_dictionary/usage_dict.txt
+#
+use 5.10.1;		# for "use feature 'switch'"
+use FindBin;
+use lib $FindBin::Bin;  # For search scripts/ODIC.pm
+require 'ODIC.pm';
+use strict;
+use feature 'switch';
+use Getopt::Long qw(:config posix_default no_ignore_case gnu_compat);
+# Command-line switches; each stays undef unless the matching option is given.
+my ($with_comment, $help_only, $mozc_source);
+GetOptions(
+    'with-comment|c'  => \$with_comment,
+    'help|h'          => \$help_only,
+    'mozc-source|m=s' => \$mozc_source,
+);
+# --help: print the synopsis to STDERR and stop with status 1.
+if (defined $help_only) {
+    print STDERR "usage: $0 [--with-comment|-c] [--help|-h] [--mozc-source|-m path_to_mozc_source]\n";
+    exit 1;
+}
+# A Mozc source tree, when supplied, has to be an existing directory.
+if (defined $mozc_source and not -d $mozc_source) {
+    print STDERR "`$mozc_source': Not a directory.\n";
+    exit 2;
+}
+# Convert the dictionary read from the files named in @ARGV (or STDIN).
+# Each entry line is "reading  word  part-of-speech [# comment]".
+while (<>) {
+    # Skip blank lines and lines that hold nothing but a comment.
+    next if /^\s*$/ || /^\s*\#.*$/;
+    # The first three whitespace-separated fields are mandatory.
+    my ($phonate, $word, $class) = /^(\S+)\s+(\S+)\s+(\S+)/;
+    unless (defined $class) {
+        # Report the malformed line and pass it through unchanged.
+        print STDERR "Error: $.: too few field number `$_'\n";
+        print  "$_";
+        next;
+    }
+    # The trailing comment is carried over only when --with-comment is set.
+    my $comment = '';
+    if (defined($with_comment) && /^\S+\s+\S+\s+\S+\s+#\s*(.*)$/) {
+        $comment = $1;
+        # Remove bookkeeping markers that are meaningless to the end user.
+        $comment =~ s/\s*@@@\s*//;	# exclusion marker meant for chasen
+        $comment =~ s/〓あり\s*\([^\)]+\)\s*//; # marker for characters outside JIS X 0208
+    }
+    ODIC::check_phonate($phonate);
+    ODIC::check_word($word);
+    print_dictionary($phonate, $word, $class, $comment);
+}
+# Finish with the entry that carries the generation date.
+version();
+exit 0;
+# Mozc's own dictionary already contains data imported from the Okinawa
+# dictionary, so entries found there are reported to avoid duplicates.
+#
+# Looks for a line of the form
+#   <reading> <4 digits> <4 digits> <4 digits> <word>
+# in $mozc_source/data/dictionary_oss/dictionary*.txt by running grep.
+# Only the reading and the word are compared; the part of speech is not.
+#
+#   $phonate - reading of the entry
+#   $word    - surface form of the entry
+#
+# Returns 1 when a matching line exists, 0 otherwise (also when no
+# --mozc-source was given or the tree holds no dictionary files).
+sub find_from_mozc_dictionary {
+    my ($phonate, $word) = @_;
+    return 0 unless defined $mozc_source;
+    my @dictionaries = glob "$mozc_source/data/dictionary_oss/dictionary*.txt";
+    # With no file operands grep would read this script's STDIN and swallow
+    # the dictionary data that is still being piped in.
+    return 0 unless @dictionaries;
+    # The reading and the word must match literally, so backslash-escape the
+    # characters that are special in a POSIX extended regular expression.
+    my @literals;
+    for my $text ($phonate, $word) {
+        (my $literal = $text) =~ s/([\\.\[{()*+?|^\$])/\\$1/g;
+        push @literals, $literal;
+    }
+    my $gap     = '[[:space:]]{1}';
+    my $number  = '[[:digit:]]{4}';
+    my $pattern = '^' . join($gap, $literals[0], ($number) x 3, $literals[1]) . '$';
+    # "grep -E" replaces the obsolescent egrep; the list form of system()
+    # keeps the pattern away from the shell.
+    my $status = system('grep', '-E', '-q', '-e', $pattern, @dictionaries);
+    return $status == 0 ? 1 : 0;	# grep exits 0 only when a line matched
+}
+# Print one entry in Mozc user-dictionary format:
+#   reading <TAB> word <TAB> part-of-speech <TAB> comment
+#
+#   $phonate - reading
+#   $word    - surface form
+#   $class   - part-of-speech name as used by the Okinawa dictionary
+#   $comment - comment text (may be empty)
+#
+# The part-of-speech name is translated to the Mozc name first.  Entries that
+# find_from_mozc_dictionary() reports as already present are not printed.
+sub print_dictionary {
+    my ($phonate, $word, $class, $comment) = @_;
+    # A plain lookup table replaces the experimental given/when construct.
+    # It is built inside the sub because the script calls exit before any
+    # file-level statement placed down here could run.
+    my %mozc_class_for = (
+        '普通名詞'       => '名詞',
+        'サ変名詞'       => '名詞サ変',
+        '形動名詞'       => '名詞形動',
+        'その他の人名'   => '人名',
+        '単純地名'       => '地名',
+        '接尾語付き地名' => '接尾地名',
+        '組織名'         => '組織',
+        'その他固有名詞' => '固有名詞',
+        '形容動詞'       => '名詞形動',
+        '数字列接頭語'   => '接頭語',
+        '接尾語'         => '接尾一般',
+        '人名接尾語'     => '接尾人名',
+        '地名接尾語'     => '接尾地名',
+        '組織名接尾語'   => '接尾一般',
+        '数字列接尾語'   => '助数詞',
+        '成句'           => '名詞',
+        '無品詞'         => '独立語',
+    );
+    # Names without an entry (姓, 名, 副詞, 接続詞, 感動詞, 形容詞, 接頭語)
+    # are passed through unchanged.
+    if (exists $mozc_class_for{$class}) {
+        $class = $mozc_class_for{$class};
+    }
+    if (!find_from_mozc_dictionary($phonate, $word)) {
+        print "$phonate\t$word\t$class\t$comment\n";
+    }
+}
+# Print a pseudo entry whose word is the local date on which the dictionary
+# was generated.  Takes no arguments and returns nothing useful.
+sub version {
+    # Only day, month and year are needed from localtime.
+    my ($mday, $mon, $year) = (localtime(time()))[3, 4, 5];
+    $year += 1900;	# localtime counts years from 1900
+    $mon++;		# localtime counts months from 0
+    # The candidate is offered only when exactly 「おきなわじしょのひづけ」 is
+    # typed; typing 「おきなわじしょのひづけとにゅうりょく」 does not show it.
+    print "おきなわじしょのひづけ\t$year年$mon月$mday日(沖縄辞書の日付け)\t短縮よみ\t沖縄辞書生成時に作成される単語\n";
+}

data/o-dic/script/oki2msime.pl ADDED Viewed

@@ -0,0 +1,136 @@
+#!/usr/bin/perl
+#
+# oki2msime.pl - 沖縄辞書のファイルをMS-IMEで取り込めるテキスト形式に変換するスクリプト
+#							GANAHA Makoto makoto@ganaha.org
+#		$Id: oki2msime.pl,v 1.5 2008/01/16 02:26:58 ga2 Exp $
+# 使用例:
+#    以下の様に「okinawa.txt」を作成し
+#    $ cat ../*.dic | ./oki2msime.pl -g | sort -u > okinawa.txt
+#    ユーザー辞書に取り込む場合
+#     「Microsoft IME 辞書ツール 2000」を起動し「ツール(T)」->「テキスト ファイルからの登録(T)」から「okinawa.txt」を取り込んで下さい。
+#    システム辞書を作成する場合
+#     「Microsoft IME 辞書ツール 2000」を起動し「ファイル(F)」->「新規作成(N)」よりダミーのユーザー辞書を作成します。
+#     「ツール(T)」->「テキスト ファイルからの登録(T)」から「okinawa.txt」を取り込んで下さい。
+#     「ツール(T)」->「システム辞書の作成(S)」からシステム辞書を作成します。
+#     ダミーのユーザー辞書はいりませんので削除してください。
+#
+#  「尚灝王」(しょうこうおう)など、JIS X 0208の範囲外の文字が含まれる単語を
+#  辞書にインポートする場合には、UTF-8 で出力してから、Notepad.exeなどで
+#  「Unicode」(UTF-16LE BOM付き)に変換してください。
+#
+use 5.10.1;		# for "use feature 'switch'"
+use FindBin;
+use lib $FindBin::Bin;  # For search scripts/ODIC.pm
+require 'ODIC.pm';
+use strict;
+use feature 'switch';
+use Getopt::Long qw(:config posix_default no_ignore_case gnu_compat);
+# Command-line switches; each stays undef unless the matching option is given.
+my $with_comment;
+my $help_only;
+my $skip_geta;
+my $utf8_output;
+# The usage text has always advertised --utf8-output while only --utf8 was
+# defined (abbreviations are off under posix_default), so both spellings are
+# accepted here.
+GetOptions(
+    'with-comment|c'      => \$with_comment,
+    'help|h'              => \$help_only,
+    'skip-geta|g'         => \$skip_geta,
+    'utf8|utf8-output|u'  => \$utf8_output
+);
+# --help: print the synopsis to STDERR and stop with status 1.
+if (defined($help_only)) {
+    print STDERR "usage: $0 [--with-comment|-c] [--help|-h] [--skip-geta|-g] [--utf8|--utf8-output|-u]\n";
+    exit 1;
+}
+# Convert the dictionary read from the files named in @ARGV (or STDIN).
+# Each entry line is "reading  word  part-of-speech [# comment]".
+while (<>) {
+    next if (/^\s*$|^\s*\#.*$/);	# skip blank and comment-only lines
+    # With --skip-geta, skip lines containing characters outside JIS X 0208.
+    # Such lines are recognised by the 「〓あり」 marker in their comment.
+    next if (defined($skip_geta) && /〓あり/);
+    if (/^(\S+)\s+(\S+)\s+(\S+)\s+#\s*(.*)$/) {
+        my $phonate = $1;	# reading
+        my $word    = $2;	# word
+        my $class   = $3;	# part of speech
+        my $comment = '';	# comment
+        if (defined($with_comment)) {
+            $comment = $4;	# the comment is kept only on request
+            # Remove bookkeeping markers from the comment.
+            $comment =~ s/\s*@@@\s*//;	# exclusion marker meant for chasen
+            $comment =~ s/〓あり\s*\([^\)]+\)\s*//; # marker for characters outside JIS X 0208
+        }
+        ODIC::check_phonate($phonate);
+        ODIC::check_word($word);
+        print_dictionary($phonate, $word, $class, $comment);
+    } elsif (/^(\S+)\s+(\S+)\s+(\S+)/) {
+        # Entry without a trailing comment.
+        my $phonate = $1;	# reading
+        my $word    = $2;	# word
+        my $class   = $3;	# part of speech
+        my $comment = '';	# comment
+        ODIC::check_phonate($phonate);
+        ODIC::check_word($word);
+        print_dictionary($phonate, $word, $class, $comment);
+    } else {
+        # Report the malformed line and pass it through unchanged.
+        print STDERR "Error: $.: too few field number `$_'\n";
+        print  "$_";
+    }
+}
+# Finish with the entry that carries the generation date.
+version();
+exit 0;
+# Print one entry in the text format imported by the MS-IME dictionary tool:
+#   reading <TAB> word <TAB> part-of-speech <TAB> comment <CR><LF>
+#
+#   $phonate - reading
+#   $word    - surface form
+#   $class   - part-of-speech name as used by the Okinawa dictionary
+#   $comment - comment text (may be empty)
+#
+# The part-of-speech name is translated to the MS-IME name first.  The record
+# is printed as is when the UTF-8 option is set, otherwise after being passed
+# through ODIC::to_shiftjis().
+sub print_dictionary {
+    my ($phonate, $word, $class, $comment) = @_;
+    # A plain lookup table replaces the experimental given/when construct.
+    # It is built inside the sub because the script calls exit before any
+    # file-level statement placed down here could run.
+    my %msime_class_for = (
+        '普通名詞'       => '名詞',
+        'サ変名詞'       => 'さ変名詞',
+        'その他の人名'   => '人名',
+        '単純地名'       => '地名その他',
+        '接尾語付き地名' => '地名接尾語',
+        '組織名'         => '固有名詞',
+        'その他固有名詞' => '固有名詞',
+        '数字列接頭語'   => '接頭語',
+        '人名接尾語'     => '姓名接尾語',
+        '組織名接尾語'   => '接尾語',
+        '数字列接尾語'   => '接尾語',
+        '成句'           => '名詞',
+        '無品詞'         => '名詞',
+    );
+    # Names without an entry (形動名詞, 姓, 名, 副詞, 接続詞, 感動詞, 形容詞,
+    # 接頭語, 接尾語, 地名接尾語) are passed through unchanged.
+    if (exists $msime_class_for{$class}) {
+        $class = $msime_class_for{$class};
+    }
+    if (defined($utf8_output)) {
+        print "$phonate\t$word\t$class\t$comment\r\n";
+    } else {
+        print ODIC::to_shiftjis("$phonate\t$word\t$class\t$comment\r\n");
+    }
+}
+# Print a pseudo entry whose word is the local date on which the dictionary
+# was generated.  The record is printed as is when the UTF-8 option is set,
+# otherwise after being passed through ODIC::to_shiftjis().
+sub version {
+    # Only day, month and year are needed from localtime.
+    my ($mday, $mon, $year) = (localtime(time()))[3, 4, 5];
+    $mon  += 1;		# localtime counts months from 0
+    $year += 1900;	# localtime counts years from 1900
+    my $entry = "おきなわじしょのひづけ\t$year/$mon/$mday(沖縄辞書の日付け)\t名詞\r\n";
+    if (defined($utf8_output)) {
+        print $entry;
+    } else {
+        print ODIC::to_shiftjis($entry);
+    }
+}

data/o-dic/script/oki2osxjapaneseim.pl ADDED Viewed

@@ -0,0 +1,168 @@
+#!/usr/bin/perl
+#
+# 沖縄辞書のファイルを Mac OS X 日本語入力メソッド(Yosemite 以降)の
+# 「追加辞書」として変換するためのスクリプト
+#
+#   $ cat *.dic | script/oki2osxjapaneseim.pl --utf8 | env LC_ALL=C sort --unique > okinawa.txt
+# 読みは20文字以内、単語は32文字以内です。
+# ことえりと違い、UTF8のCSVファイルとして変換します。
+#
+# システム環境設定の[キーボード] で日本語入力メソッドの
+# [入力ソース]タブを選択し、追加辞書リストボックスに
+# 変換したファイルをドロップする。
+use FindBin;
+use lib $FindBin::Bin;	# For search scripts/ODIC.pm
+require 5.6.0;
+require 'ODIC.pm';
+use strict;
+use Getopt::Long qw(:config posix_default no_ignore_case gnu_compat);
+# Fields of the entry being converted; shared with convert_class().
+our ($phonate, $word, $class);
+# Command-line switches; each stays undef unless the matching option is given.
+my ($help_only, $skip_geta, $utf8);
+GetOptions(
+    'help|h'      => \$help_only,
+    'skip-geta|g' => \$skip_geta,
+    'utf8|u'      => \$utf8,
+);
+# --help: print the synopsis to STDERR and stop with status 1.
+if (defined $help_only) {
+    print STDERR "usage: $0 [--help|-h] [--utf8|-u [--skip-geta|-g]]\n";
+    exit 1;
+}
+# Lines with characters outside JIS X 0208 are dropped when --skip-geta is
+# given, and always when the output is not UTF-8.  Such lines are recognised
+# by the 「〓あり」 marker in their comment.
+my $drop_geta = defined($skip_geta) || !defined($utf8);
+while (<>) {
+    next if $drop_geta && /〓あり/;
+    s/#.*$//;		# strip everything from `#' onwards
+    next if /^\s*$/;	# nothing left: skip the line
+    unless (/(\S+)\s+(\S+)\s+(\S+)/) {
+        # Report the malformed line and pass it through unchanged.
+        print STDERR "Error: $.: too few field number `$_'\n";
+        print  "$_";
+        next;
+    }
+    ($phonate, $word, $class) = ($1, $2, $3);	# reading, word, part of speech
+    # On Yosemite 「Q's瑞穂」 would otherwise turn into 「Qs瑞穂」,
+    # so every ' is rewritten as "'.
+    $word =~ s/\'/\"\'/g;
+    ODIC::check_phonate($phonate);
+    # check_word() treats " as an error, so words that received the
+    # escape above are not checked.
+    ODIC::check_word($word) unless $word =~ /\"\'/;
+    convert_class();
+}
+# Finish with the entry that carries the generation date.
+version();
+exit 0;
+# The part-of-speech conversion rules follow this reference:
+#
+# "[Yosemite edition] Handy notes (tips) for working with JapaneseIM
+# (formerly Kotoeri)"
+# http://nadroom.dousetsu.com/kotoeri/kotoeri_yosemite.html
+#
+# Translates the package variable $class to the name used by the OS X
+# Japanese input method and prints the record "$phonate,$word,$class" with a
+# CR LF line ending.  An unknown part of speech is reported on STDERR and the
+# record is still printed with the name unchanged.  The record is printed as
+# is when --utf8 is set, otherwise after ODIC::to_shiftjis().
+sub convert_class {
+    # Okinawa-dictionary name => input-method name.  Names that map to
+    # themselves are accepted without change.  The table is built inside the
+    # sub because the script calls exit before any file-level statement
+    # placed down here could run.
+    my %class_for = (
+        '普通名詞'       => '普通名詞',
+        'サ変名詞'       => 'サ変名詞',
+        '形動名詞'       => '普通名詞',
+        '姓'             => 'その他の固有名詞',
+        '名'             => 'その他の固有名詞',
+        'その他の人名'   => 'その他の固有名詞',
+        '単純地名'       => '地名',
+        '接尾語付き地名' => '地名',
+        '組織名'         => 'その他の固有名詞',
+        'その他固有名詞' => 'その他の固有名詞',
+        '副詞'           => '副詞',
+        '接続詞'         => '無品詞',
+        '感動詞'         => '無品詞',
+        '形容詞'         => '形容詞',
+        '形容動詞'       => '無品詞',
+        '接頭語'         => '普通名詞',
+        '数字列接頭語'   => '無品詞',
+        '接尾語'         => '普通名詞',
+        '人名接尾語'     => '無品詞',
+        '地名接尾語'     => '無品詞',
+        '組織名接尾語'   => '無品詞',
+        '数字列接尾語'   => '無品詞',
+        '成句'           => '無品詞',
+        '無品詞'         => '無品詞',
+    );
+    if (exists $class_for{$class}) {
+        $class = $class_for{$class};
+    } else {
+        print STDERR "Error: $.: unknown class `$class': $phonate\t$word\n";
+    }
+    my $record = "$phonate,$word,$class\r\n";
+    if (defined($utf8)) {
+        print $record;
+    } else {
+        print ODIC::to_shiftjis($record);
+    }
+}
+# Print a pseudo entry whose word is the local date on which the dictionary
+# was generated.  The record is printed as is when --utf8 is set, otherwise
+# after being passed through ODIC::to_shiftjis().
+sub version {
+    # Only day, month and year are needed from localtime.
+    my ($mday, $mon, $year) = (localtime(time()))[3, 4, 5];
+    $mon  += 1;		# localtime counts months from 0
+    $year += 1900;	# localtime counts years from 1900
+    my $entry = "おきなわじしょのひづけ,$year年$mon月$mday日(沖縄辞書の日付け),無品詞\r\n";
+    if (defined($utf8)) {
+        print $entry;
+    } else {
+        print ODIC::to_shiftjis($entry);
+    }
+}