furigana 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +6 -0
- data/bin/furigana +57 -0
- data/furigana.rdoc +5 -0
- data/lib/furigana.rb +8 -0
- data/lib/furigana/formatter/base.rb +41 -0
- data/lib/furigana/formatter/html.rb +12 -0
- data/lib/furigana/formatter/json.rb +11 -0
- data/lib/furigana/formatter/text.rb +12 -0
- data/lib/furigana/formatters.rb +4 -0
- data/lib/furigana/mecab.rb +25 -0
- data/lib/furigana/reader.rb +62 -0
- data/lib/furigana/version.rb +3 -0
- metadata +145 -0
data/README.rdoc
ADDED
data/bin/furigana
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
require 'furigana'
|
4
|
+
require 'singleton'
|
5
|
+
require 'ostruct'
|
6
|
+
require 'optparse'
|
7
|
+
|
8
|
+
module Furigana
  # Command-line front end for the gem.
  #
  # Reads the output-format switch from ARGV, reads the input text through
  # ARGF and prints the text with furigana added in the selected format.
  class CLI
    include Singleton

    # Output formats offered as command-line switches, with their help text.
    FORMAT_OPTIONS = [
      [:text, "Add furigana and output text (default)"],
      [:html, "Add furigana and output HTML"],
      [:json, "Add furigana and output JSON"]
    ].freeze

    # Starts with the plain-text output format selected.
    def initialize
      @settings = OpenStruct.new
      @settings.format = :text
    end

    # Parses ARGV in place. Recognised switches are removed from ARGV, so
    # whatever remains is what ARGF will later treat as input file names.
    #
    # An unknown or malformed option prints the parser's message followed by
    # the usage text on standard error and exits with status 1, rather than
    # letting the OptionParser exception escape as a backtrace.
    def parse_options
      parser = build_option_parser
      parser.parse!
    rescue OptionParser::ParseError => e
      abort "#{e.message}\n#{parser}"
    end

    # Parses the options, reads all input from ARGF and prints the result
    # of the formatter that matches the selected format.
    def start
      parse_options

      input = ARGF.read
      formatter = formatter_class
      puts formatter.format(input, Reader.new.reading(input)) if formatter
    end

    private

    # Builds the OptionParser; each format switch stores its symbol in
    # @settings.format, and --help / --version print and exit.
    def build_option_parser
      OptionParser.new do |opts|
        opts.banner = "Usage: furigana [options] [file]"

        FORMAT_OPTIONS.each do |format, description|
          opts.on("--#{format}", description) do
            @settings.format = format
          end
        end
        opts.on_tail("-h", "--help", "Show this message") do
          puts opts
          exit
        end
        opts.on_tail("--version", "Show version") do
          puts Furigana::VERSION
          exit
        end
      end
    end

    # Maps the selected format to its formatter class. The constants are
    # resolved only when called; returns nil for an unrecognised format.
    def formatter_class
      case @settings.format
      when :text then Formatter::Text
      when :html then Formatter::HTML
      when :json then Formatter::JSON
      end
    end
  end
end
|
56
|
+
|
57
|
+
# Script entry point: run the singleton CLI (parse options, read input, print).
Furigana::CLI.instance.start
|
data/furigana.rdoc
ADDED
data/lib/furigana.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
module Furigana
  module Formatter
    # Shared walking logic for all formatters.
    #
    # Subclasses supply a class method `replacement(surface_form, reading)`
    # that returns the text to emit in place of one annotated token.
    class Base
      class << self
        # Returns a copy of +text+ in which each token is replaced by the
        # subclass's `replacement` for it.
        #
        # text   - the original String.
        # tokens - an ordered list of two-element entries; index 0 is the
        #          surface form as it appears in +text+, index 1 its reading.
        #
        # Tokens are located left to right: each one is searched for from the
        # end of the previous match, and the text between matches is copied
        # through unchanged. A token that has an empty surface form or cannot
        # be found in the remaining text is skipped. An empty token list
        # simply returns a copy of +text+.
        def format(text, tokens)
          surface_form, reading = 0, 1
          new_text = ''.dup
          cursor = 0

          tokens.each do |token|
            surface = token[surface_form]
            next if surface.nil? || surface.empty?

            found_at = text.index(surface, cursor)
            next unless found_at

            # Copy the untouched gap, then the annotated token.
            new_text << text[cursor...found_at]
            new_text << replacement(surface, token[reading])
            cursor = found_at + surface.length
          end

          # Whatever follows the last matched token is copied verbatim.
          new_text << text[cursor..-1]
        end
      end
    end
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
module Furigana
  module Formatter
    # Formatter that emits each annotated token as an HTML <ruby> element.
    class HTML < Formatter::Base
      # Builds the markup for one token: the surface form inside <rb>, the
      # reading inside <rt>, with 【 】 as the <rp> fallback brackets.
      # Both values are inserted as given; no HTML escaping is applied.
      #
      # sprintf is used on purpose: a bare `format` call here would resolve
      # to the inherited Base.format(text, tokens), not Kernel#format.
      def self.replacement(surface_form, reading)
        sprintf("<ruby><rb>%s</rb><rp>【</rp><rt>%s</rt><rp>】</rp></ruby>", surface_form, reading)
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'open3'
|
3
|
+
|
4
|
+
module Furigana
  # Thin wrapper around the external `mecab` command, run with ChaSen-style
  # (tab-separated) output.
  class Mecab
    # Column positions within one tab-separated output line.
    SURFACE_FORM_COLUMN = 0
    READING_COLUMN = 1

    class << self
      # Returns +text+ with every newline character removed.
      def sanitize_text(text)
        text.delete("\n")
      end

      # Runs `mecab -Ochasen` on the sanitized text and returns an Array of
      # Hashes, one per token, with :surface_form and :reading keys.
      #
      # Raises RuntimeError (including mecab's standard error output) when
      # the command exits unsuccessfully, so that a failed run is not
      # mistaken for input that produced no tokens.
      def tokenize(text)
        stdout, stderr, status = Open3.capture3("mecab -Ochasen", :stdin_data => sanitize_text(text))
        raise "mecab failed (#{status}): #{stderr.strip}" unless status.success?

        parse_chasen(stdout)
      end

      private

      # Converts mecab's output into token hashes. Lines without a surface
      # form (blank lines) and the "EOS" marker line are skipped.
      def parse_chasen(output)
        output.split("\n").each_with_object([]) do |line, tokens|
          columns = line.split("\t")
          surface = columns[SURFACE_FORM_COLUMN]
          next if surface.nil? || surface.empty? || surface == 'EOS'

          tokens << {
            :surface_form => surface,
            :reading => columns[READING_COLUMN]
          }
        end
      end
    end
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
module Furigana
  # Turns text into a list of [kanji, yomi] pairs by tokenizing it with
  # Mecab and diffing each token's surface form against its reading.
  class Reader
    # Diff::LCS sdiff action markers, named for what they mean here:
    # a changed element, an element only in the reading, an unchanged one.
    KANJI_AND_YOMI = '!'
    YOMI = '+'
    KANA = '='

    # Returns an Array of two-element [kanji, yomi] Arrays, one for each
    # token of +text+ for which add_furigana produced a pair. Tokens whose
    # surface form and reading do not differ contribute nothing.
    def reading(text)
      Mecab.tokenize(text).each_with_object([]) do |token, list|
        with_furigana = add_furigana(yomi_to_hiragana(token))
        list.push(with_furigana) if with_furigana
      end
    end

    private

    # Replaces token[:reading] in place with the chosen reading and returns
    # the same token.
    def yomi_to_hiragana(token)
      token[:reading] = choose_reading(token[:surface_form], token[:reading])
      token
    end

    # Passes +k+ through NKF with the "-h1 -w" options; nil stays nil.
    def k2h(k)
      return nil if k.nil?

      NKF.nkf("-h1 -w", k)
    end

    # Truthy when +str+ consists solely of hiragana, katakana (including
    # ゔ / ヴ) and the long-vowel mark. Anchored with \A and \z so the whole
    # string is tested, not just one of its lines.
    def kana?(str)
      /\A[ぁ-ゔァ-ヴー]+\z/.match(str)
    end

    # A kana-only surface form is its own reading; anything else uses the
    # converted reading.
    def choose_reading(surface_form, reading)
      kana?(surface_form) ? surface_form : k2h(reading)
    end

    def sdiff(first, second)
      Diff::LCS.sdiff(first, second)
    end

    def diff_token_surface_form_and_reading(token)
      sdiff(token[:surface_form], token[:reading])
    end

    # Returns a [kanji, yomi] pair built from the diff between the token's
    # surface form and reading, or nil when the diff contains no changed
    # element or when the token lacks a surface form or reading.
    #
    # NOTE(review): a changed element that follows an unchanged one starts
    # a fresh pair, so only the last run of changed elements in a token is
    # returned. Confirm whether multi-run tokens should yield several pairs.
    def add_furigana(token)
      return nil if token[:surface_form].nil? || token[:reading].nil?

      kanji, yomi = 0, 1
      list = nil
      on_kanji = false

      diff_token_surface_form_and_reading(token).each do |part|
        case part.action
        when KANJI_AND_YOMI
          list = ['', ''] unless on_kanji
          list[kanji] += part.old_element
          list[yomi] += part.new_element
          on_kanji = true
        when YOMI
          # A reading-only element seen before any pair exists has nothing
          # to attach to; ignore it instead of failing on nil.
          list[yomi] += part.new_element if list
        when KANA
          on_kanji = false
        end
      end
      list
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,145 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: furigana
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Michael V. O'Brien
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-01-19 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rake
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rdoc
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: diff-lcs
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: debugger
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: diff-lcs
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :runtime
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
description:
|
95
|
+
email: michael@michaelvobrien.com
|
96
|
+
executables:
|
97
|
+
- furigana
|
98
|
+
extensions: []
|
99
|
+
extra_rdoc_files:
|
100
|
+
- README.rdoc
|
101
|
+
- furigana.rdoc
|
102
|
+
files:
|
103
|
+
- bin/furigana
|
104
|
+
- lib/furigana.rb
|
105
|
+
- lib/furigana/formatter/base.rb
|
106
|
+
- lib/furigana/formatter/html.rb
|
107
|
+
- lib/furigana/formatter/json.rb
|
108
|
+
- lib/furigana/formatter/text.rb
|
109
|
+
- lib/furigana/formatters.rb
|
110
|
+
- lib/furigana/mecab.rb
|
111
|
+
- lib/furigana/reader.rb
|
112
|
+
- lib/furigana/version.rb
|
113
|
+
- README.rdoc
|
114
|
+
- furigana.rdoc
|
115
|
+
homepage: https://github.com/michaelvobrien/furigana
|
116
|
+
licenses: []
|
117
|
+
post_install_message:
|
118
|
+
rdoc_options:
|
119
|
+
- --title
|
120
|
+
- furigana
|
121
|
+
- --main
|
122
|
+
- README.rdoc
|
123
|
+
- -ri
|
124
|
+
require_paths:
|
125
|
+
- lib
|
126
|
+
- lib
|
127
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
128
|
+
none: false
|
129
|
+
requirements:
|
130
|
+
- - ! '>='
|
131
|
+
- !ruby/object:Gem::Version
|
132
|
+
version: '0'
|
133
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
134
|
+
none: false
|
135
|
+
requirements:
|
136
|
+
- - ! '>='
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
139
|
+
requirements: []
|
140
|
+
rubyforge_project:
|
141
|
+
rubygems_version: 1.8.23
|
142
|
+
signing_key:
|
143
|
+
specification_version: 3
|
144
|
+
summary: Add furigana to text
|
145
|
+
test_files: []
|