furigana 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +6 -0
- data/bin/furigana +57 -0
- data/furigana.rdoc +5 -0
- data/lib/furigana.rb +8 -0
- data/lib/furigana/formatter/base.rb +41 -0
- data/lib/furigana/formatter/html.rb +12 -0
- data/lib/furigana/formatter/json.rb +11 -0
- data/lib/furigana/formatter/text.rb +12 -0
- data/lib/furigana/formatters.rb +4 -0
- data/lib/furigana/mecab.rb +25 -0
- data/lib/furigana/reader.rb +62 -0
- data/lib/furigana/version.rb +3 -0
- metadata +145 -0
data/README.rdoc
ADDED
data/bin/furigana
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
require 'furigana'
|
4
|
+
require 'singleton'
|
5
|
+
require 'ostruct'
|
6
|
+
require 'optparse'
|
7
|
+
|
8
|
+
module Furigana
  # Command-line front end for the furigana executable.
  #
  # Parses the output-format switch from ARGV, reads the input through ARGF
  # (files named on the command line, or standard input when none are given)
  # and prints the input annotated by the selected formatter.
  class CLI
    include Singleton

    # Holds the parsed command-line settings; +format+ is one of
    # :text, :html or :json.
    Settings = Struct.new(:format)

    def initialize
      @settings = Settings.new(:text)
    end

    # Parses ARGV destructively, leaving only non-option arguments (the input
    # file names) behind for ARGF.
    #
    # An unknown or malformed option prints the error followed by the usage
    # text to standard error and exits with status 1.
    def parse_options
      parser = OptionParser.new do |opts|
        opts.banner = "Usage: furigana [options] [file]"

        opts.on("--text", "Add furigana and output text (default)") do
          @settings.format = :text
        end
        opts.on("--html", "Add furigana and output HTML") do
          @settings.format = :html
        end
        opts.on("--json", "Add furigana and output JSON") do
          @settings.format = :json
        end
        opts.on_tail("-h", "--help", "Show this message") do
          puts opts
          exit
        end
        opts.on_tail("--version", "Show version") do
          puts Furigana::VERSION
          exit
        end
      end
      parser.parse!
    rescue OptionParser::ParseError => e
      abort "#{e.message}\n#{parser}"
    end

    # Entry point: parses the options, reads all input and prints the
    # formatted result to standard output.
    def start
      parse_options

      input = ARGF.read
      puts formatter.format(input, Reader.new.reading(input))
    end

    private

    # Returns the formatter class for the selected output format.
    # The constants are resolved lazily, at call time, so this file does not
    # depend on the load order of the formatter classes.
    def formatter
      case @settings.format
      when :html then Formatter::HTML
      when :json then Formatter::JSON
      else Formatter::Text
      end
    end
  end
end
|
56
|
+
|
57
|
+
Furigana::CLI.instance.start
|
data/furigana.rdoc
ADDED
data/lib/furigana.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
module Furigana
  module Formatter
    # Shared driver for the output formatters.
    #
    # Subclasses implement the class method +replacement(surface_form, reading)+,
    # which returns the markup that stands in for one annotated piece of text.
    class Base
      class << self
        # Returns a copy of +text+ in which each token's surface form is
        # replaced by +replacement(surface_form, reading)+.
        #
        # text   - the String to annotate.
        # tokens - an enumerable of [surface_form, reading] pairs, in the
        #          order their surface forms occur in +text+.
        #
        # Tokens are consumed in order and each one is searched for only
        # after the end of the previous match. Text between matches, and
        # after the last one, is copied through unchanged. A token whose
        # surface form is nil, empty or absent from the remaining text is
        # skipped, so an empty token list returns the text unchanged.
        def format(text, tokens)
          output = ''.dup
          remaining = text

          tokens.each do |surface_form, reading|
            next if surface_form.nil? || surface_form.empty?

            before, match, after = remaining.partition(surface_form)
            next if match.empty? # surface form not found in what is left

            output << before << replacement(surface_form, reading)
            remaining = after
          end

          output << remaining
        end

        # Markup for a single annotated token; must be provided by subclasses.
        def replacement(_surface_form, _reading)
          raise NotImplementedError, "#{name} must implement replacement(surface_form, reading)"
        end
      end
    end
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
module Furigana
  module Formatter
    # Formatter that renders each annotated token as an HTML <ruby> element.
    class HTML < Formatter::Base
      # <rb> holds the base text and <rt> the reading; the <rp> brackets are
      # the fallback parentheses that sit around the reading in the markup.
      TEMPLATE = "<ruby><rb>%s</rb><rp>【</rp><rt>%s</rt><rp>】</rp></ruby>".freeze

      class << self
        # Returns the <ruby> markup for one token.
        #
        # surface_form - the text as it appears in the input.
        # reading      - the reading to show for it.
        #
        # NOTE(review): neither value is HTML-escaped here - confirm whether
        # callers can pass text containing markup characters.
        def replacement(surface_form, reading)
          TEMPLATE % [surface_form, reading]
        end
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'open3'
|
3
|
+
|
4
|
+
module Furigana
  # Wrapper around the external `mecab` command, run with `-Ochasen`.
  class Mecab
    # Raised when the mecab process exits unsuccessfully.
    class Error < StandardError; end

    # Line that mecab prints on its own to mark the end of a sentence.
    END_OF_SENTENCE = 'EOS'.freeze

    class << self
      # Returns +text+ with every newline removed, so that the whole input is
      # handed to mecab as a single line.
      def sanitize_text(text)
        text.delete("\n")
      end

      # Runs mecab over +text+ and returns its tokens.
      #
      # Returns an Array of Hashes with the keys :surface_form and :reading,
      # taken from the first and second tab-separated column of each output
      # line. :reading is nil when a line has no second column.
      #
      # Raises Furigana::Mecab::Error when mecab exits with a non-zero
      # status. Errors raised by Open3 itself (for example when the
      # executable cannot be started) are not rescued.
      def tokenize(text)
        # Argument-list form: the command is started without a shell.
        stdout, stderr, status = Open3.capture3('mecab', '-Ochasen', :stdin_data => sanitize_text(text))
        unless status.success?
          raise Error, "mecab exited with status #{status.exitstatus}: #{stderr.strip}"
        end

        parse_chasen(stdout)
      end

      private

      # Converts mecab's tab-separated output into token hashes, skipping
      # blank lines and the bare end-of-sentence marker line.
      def parse_chasen(output)
        output.each_line.each_with_object([]) do |raw_line, tokens|
          line = raw_line.chomp
          next if line == END_OF_SENTENCE

          surface_form, reading = line.split("\t")
          next if surface_form.nil? # blank line

          tokens << { :surface_form => surface_form, :reading => reading }
        end
      end
    end
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
module Furigana
  # Turns text into the list of [kanji, yomi] pairs the formatters consume.
  class Reader
    # Matches a string made up only of hiragana, katakana and the long-vowel
    # mark. Anchored with \A and \z so the whole string has to match.
    KANA_ONLY = /\A[ぁ-んァ-ンー]+\z/

    # Diff::LCS.sdiff actions this class reacts to.
    KANJI_AND_YOMI = '!'.freeze # surface element and reading element differ
    YOMI_ONLY      = '+'.freeze # element present only in the reading
    KANA           = '='.freeze # element identical in surface form and reading

    # Positions inside a [kanji, yomi] pair.
    KANJI = 0
    YOMI  = 1

    # Tokenizes +text+ with Mecab and returns an Array of [kanji, yomi]
    # pairs, one for each token that produced a pair. Tokens for which
    # add_furigana returns nil are left out.
    def reading(text)
      Mecab.tokenize(text).each_with_object([]) do |token, pairs|
        pair = add_furigana(yomi_to_hiragana(token))
        pairs.push(pair) if pair
      end
    end

    private

    # Replaces token[:reading] in place with the reading chosen by
    # choose_reading and returns the same token.
    def yomi_to_hiragana(token)
      token[:reading] = choose_reading(token[:surface_form], token[:reading])
      token
    end

    # Passes +k+ through NKF with the "-h1 -w" options; nil is returned as nil.
    def k2h(k)
      return nil if k.nil?
      NKF.nkf("-h1 -w", k)
    end

    # True when +str+ consists solely of kana characters (see KANA_ONLY).
    # nil is not kana.
    def kana?(str)
      !!(KANA_ONLY =~ str)
    end

    # A kana-only surface form is its own reading; otherwise the reading is
    # run through k2h.
    def choose_reading(surface_form, reading)
      kana?(surface_form) ? surface_form : k2h(reading)
    end

    def sdiff(first, second)
      Diff::LCS.sdiff(first, second)
    end

    def diff_token_surface_form_and_reading(token)
      sdiff(token[:surface_form], token[:reading])
    end

    # Builds the [kanji, yomi] pair for +token+ from the diff between its
    # surface form and its reading.
    #
    # Returns nil when the token has no surface form or no reading, or when
    # the diff contains no '!' action.
    #
    # NOTE(review): every '!' run that follows an '=' action starts a fresh
    # pair, so only the last such run of a token is returned. Diff actions
    # other than '!', '+' and '=' are ignored.
    def add_furigana(token)
      return nil if token[:surface_form].nil? || token[:reading].nil?

      pair = nil
      on_kanji = false
      diff_token_surface_form_and_reading(token).each do |part|
        case part.action
        when KANJI_AND_YOMI
          pair = ['', ''] unless on_kanji
          pair[KANJI] += part.old_element
          pair[YOMI] += part.new_element
          on_kanji = true
        when YOMI_ONLY
          # Nothing to extend if no pair has been started yet.
          pair[YOMI] += part.new_element if pair
        when KANA
          on_kanji = false
        end
      end
      pair
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,145 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: furigana
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Michael V. O'Brien
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-01-19 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rake
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rdoc
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: diff-lcs
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: debugger
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: diff-lcs
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :runtime
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
description:
|
95
|
+
email: michael@michaelvobrien.com
|
96
|
+
executables:
|
97
|
+
- furigana
|
98
|
+
extensions: []
|
99
|
+
extra_rdoc_files:
|
100
|
+
- README.rdoc
|
101
|
+
- furigana.rdoc
|
102
|
+
files:
|
103
|
+
- bin/furigana
|
104
|
+
- lib/furigana.rb
|
105
|
+
- lib/furigana/formatter/base.rb
|
106
|
+
- lib/furigana/formatter/html.rb
|
107
|
+
- lib/furigana/formatter/json.rb
|
108
|
+
- lib/furigana/formatter/text.rb
|
109
|
+
- lib/furigana/formatters.rb
|
110
|
+
- lib/furigana/mecab.rb
|
111
|
+
- lib/furigana/reader.rb
|
112
|
+
- lib/furigana/version.rb
|
113
|
+
- README.rdoc
|
114
|
+
- furigana.rdoc
|
115
|
+
homepage: https://github.com/michaelvobrien/furigana
|
116
|
+
licenses: []
|
117
|
+
post_install_message:
|
118
|
+
rdoc_options:
|
119
|
+
- --title
|
120
|
+
- furigana
|
121
|
+
- --main
|
122
|
+
- README.rdoc
|
123
|
+
- -ri
|
124
|
+
require_paths:
|
125
|
+
- lib
|
126
|
+
- lib
|
127
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
128
|
+
none: false
|
129
|
+
requirements:
|
130
|
+
- - ! '>='
|
131
|
+
- !ruby/object:Gem::Version
|
132
|
+
version: '0'
|
133
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
134
|
+
none: false
|
135
|
+
requirements:
|
136
|
+
- - ! '>='
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
139
|
+
requirements: []
|
140
|
+
rubyforge_project:
|
141
|
+
rubygems_version: 1.8.23
|
142
|
+
signing_key:
|
143
|
+
specification_version: 3
|
144
|
+
summary: Add furigana to text
|
145
|
+
test_files: []
|