furigana 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/furigana +1 -59
- data/lib/furigana.rb +0 -4
- data/lib/furigana/formatter/base.rb +75 -35
- data/lib/furigana/formatter/html.rb +2 -5
- data/lib/furigana/formatter/json.rb +4 -4
- data/lib/furigana/formatter/text.rb +2 -5
- data/lib/furigana/formatter/yomikata.rb +12 -13
- data/lib/furigana/mecab.rb +22 -16
- data/lib/furigana/reader.rb +9 -5
- data/lib/furigana/version.rb +1 -1
- metadata +33 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6ff27abadc679f46eaf32b412c9122df44c55b29
|
4
|
+
data.tar.gz: 6c5e06f33c1dded6d583a7bb7878a89ad9973d42
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9fa6a5abd830cf2488924aef22a36ff3744706026d6a8713101118aef9636bf01785464421da019dcf601a268ff75cf531b6caff0df2e3137575829a29c72531
|
7
|
+
data.tar.gz: 3616341a08872b570b8e5fa3e0f7ae95607e0588b2fbd8def30b0b873f5fc41c2f0f3aa507b5eb15db3e64159219f01dd023ee71dde97ae211725e55df0749a2
|
data/bin/furigana
CHANGED
@@ -1,62 +1,4 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'furigana'
|
4
|
-
require 'singleton'
|
5
|
-
require 'ostruct'
|
6
|
-
require 'optparse'
|
7
|
-
|
8
|
-
module Furigana
|
9
|
-
class CLI
|
10
|
-
include Singleton
|
11
|
-
|
12
|
-
def initialize
|
13
|
-
@settings = OpenStruct.new
|
14
|
-
@settings.format = :text
|
15
|
-
end
|
16
|
-
|
17
|
-
def parse_options
|
18
|
-
OptionParser.new do |opts|
|
19
|
-
opts.banner = "Usage: furigana [options] [file]"
|
20
|
-
|
21
|
-
opts.on("--text", "Add furigana and output text (default)") do
|
22
|
-
@settings.format = :text
|
23
|
-
end
|
24
|
-
opts.on("--html", "Add furigana and output HTML") do
|
25
|
-
@settings.format = :html
|
26
|
-
end
|
27
|
-
opts.on("--yomikata", "Output yomikata only") do
|
28
|
-
@settings.format = :yomikata
|
29
|
-
end
|
30
|
-
opts.on("--json", "Add furigana and output JSON") do
|
31
|
-
@settings.format = :json
|
32
|
-
end
|
33
|
-
opts.on_tail("-h", "--help", "Show this message") do
|
34
|
-
puts opts
|
35
|
-
exit
|
36
|
-
end
|
37
|
-
opts.on_tail("--version", "Show version") do
|
38
|
-
puts Furigana::VERSION
|
39
|
-
exit
|
40
|
-
end
|
41
|
-
end.parse!
|
42
|
-
end
|
43
|
-
|
44
|
-
def start
|
45
|
-
parse_options
|
46
|
-
|
47
|
-
input = ARGF.read
|
48
|
-
case @settings.format
|
49
|
-
when :text
|
50
|
-
puts Formatter::Text.format(input, Reader.new.reading(input))
|
51
|
-
when :html
|
52
|
-
puts Formatter::HTML.format(input, Reader.new.reading(input))
|
53
|
-
when :yomikata
|
54
|
-
puts Formatter::Yomikata.format(input, Reader.new.reading(input))
|
55
|
-
when :json
|
56
|
-
puts Formatter::JSON.format(input, Reader.new.reading(input))
|
57
|
-
end
|
58
|
-
end
|
59
|
-
end
|
60
|
-
end
|
2
|
+
require 'furigana/cli'
|
61
3
|
|
62
4
|
Furigana::CLI.instance.start
|
data/lib/furigana.rb
CHANGED
@@ -1,44 +1,84 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
1
|
module Furigana
|
3
2
|
module Formatter
|
4
3
|
class Base
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
4
|
+
SURFACE_FORM, READING = 0, 1
|
5
|
+
|
6
|
+
def initialize(text, kanji_tokens)
|
7
|
+
@text = text
|
8
|
+
@kanji_tokens = kanji_tokens
|
9
|
+
end
|
10
|
+
|
11
|
+
def render
|
12
|
+
reset
|
13
|
+
|
14
|
+
@text.each_char do |char|
|
15
|
+
if no_more_kanji_tokens?
|
16
|
+
@new_text += char
|
17
|
+
next
|
18
|
+
end
|
19
|
+
|
20
|
+
@substring += char
|
21
|
+
|
22
|
+
if not_a_kanji_group_match? char
|
23
|
+
@new_text += @substring
|
24
|
+
reset_substring
|
25
|
+
next
|
14
26
|
end
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
current_token = tokens_enum.next
|
25
|
-
rescue StopIteration
|
26
|
-
current_token = nil
|
27
|
-
end
|
28
|
-
substring, pos = "", 0
|
29
|
-
else
|
30
|
-
pos += 1
|
31
|
-
end
|
32
|
-
else # not a match
|
33
|
-
new_text += substring
|
34
|
-
substring, pos = "", 0
|
35
|
-
end
|
36
|
-
else # no more tokens
|
37
|
-
new_text += char
|
38
|
-
end
|
27
|
+
|
28
|
+
if kanji_group_match?
|
29
|
+
# replace kanji group with formatting
|
30
|
+
@new_text += replacement(@current_token[SURFACE_FORM], @current_token[READING])
|
31
|
+
@current_token = next_token
|
32
|
+
reset_substring
|
33
|
+
else
|
34
|
+
# kanji token pos advances with char pos
|
35
|
+
increment_kanji_char_pos
|
39
36
|
end
|
40
|
-
new_text
|
41
37
|
end
|
38
|
+
|
39
|
+
@new_text
|
40
|
+
end
|
41
|
+
|
42
|
+
private
|
43
|
+
|
44
|
+
def reset
|
45
|
+
@new_text = ""
|
46
|
+
reset_substring
|
47
|
+
|
48
|
+
@kanji_tokens_enum = @kanji_tokens.to_enum
|
49
|
+
@current_token = next_token
|
50
|
+
end
|
51
|
+
|
52
|
+
def reset_substring
|
53
|
+
@substring = ""
|
54
|
+
@kanji_char_pos = 0
|
55
|
+
end
|
56
|
+
|
57
|
+
def increment_kanji_char_pos
|
58
|
+
@kanji_char_pos += 1
|
59
|
+
end
|
60
|
+
|
61
|
+
def no_more_kanji_tokens?
|
62
|
+
@current_token.nil?
|
63
|
+
end
|
64
|
+
|
65
|
+
def not_a_kanji_group_match?(char)
|
66
|
+
current_kanji_char = @current_token[SURFACE_FORM][@kanji_char_pos]
|
67
|
+
char != current_kanji_char
|
68
|
+
end
|
69
|
+
|
70
|
+
def kanji_group_match?
|
71
|
+
@kanji_char_pos == (@current_token[SURFACE_FORM].length - 1)
|
72
|
+
end
|
73
|
+
|
74
|
+
def next_token
|
75
|
+
@kanji_tokens_enum.next
|
76
|
+
rescue StopIteration
|
77
|
+
nil
|
78
|
+
end
|
79
|
+
|
80
|
+
def replacement(surface_form, reading)
|
81
|
+
fail NotImplementedError, "#{self.class} must implement `#{__method__}`"
|
42
82
|
end
|
43
83
|
end
|
44
84
|
end
|
@@ -1,11 +1,8 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
1
|
module Furigana
|
3
2
|
module Formatter
|
4
3
|
class HTML < Formatter::Base
|
5
|
-
|
6
|
-
|
7
|
-
"<ruby><rb>%s</rb><rp>【</rp><rt>%s</rt><rp>】</rp></ruby>" % [surface_form, reading]
|
8
|
-
end
|
4
|
+
def replacement(surface_form, reading)
|
5
|
+
"<ruby><rb>%s</rb><rp>【</rp><rt>%s</rt><rp>】</rp></ruby>" % [surface_form, reading]
|
9
6
|
end
|
10
7
|
end
|
11
8
|
end
|
@@ -1,11 +1,8 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
1
|
module Furigana
|
3
2
|
module Formatter
|
4
3
|
class Text < Formatter::Base
|
5
|
-
|
6
|
-
|
7
|
-
"%s【%s】" % [surface_form, reading]
|
8
|
-
end
|
4
|
+
def replacement(surface_form, reading)
|
5
|
+
"%s【%s】" % [surface_form, reading]
|
9
6
|
end
|
10
7
|
end
|
11
8
|
end
|
@@ -1,22 +1,21 @@
|
|
1
|
-
|
1
|
+
require 'nkf'
|
2
|
+
|
2
3
|
module Furigana
|
3
4
|
module Formatter
|
4
5
|
class Yomikata < Formatter::Base
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
end
|
6
|
+
def replacement(surface_form, reading)
|
7
|
+
reading
|
8
|
+
end
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
|
10
|
+
def render
|
11
|
+
k2h(super)
|
12
|
+
end
|
13
13
|
|
14
|
-
|
14
|
+
private
|
15
15
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
end
|
16
|
+
def k2h(k)
|
17
|
+
return nil if k.nil?
|
18
|
+
NKF.nkf("-h1 -w", k)
|
20
19
|
end
|
21
20
|
end
|
22
21
|
end
|
data/lib/furigana/mecab.rb
CHANGED
@@ -1,32 +1,38 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
1
|
require 'open3'
|
3
2
|
|
4
3
|
module Furigana
|
5
4
|
class Mecab
|
6
5
|
class << self
|
7
|
-
def sanitize_text(text)
|
8
|
-
format("%s\n", text.tr("\n", ""))
|
9
|
-
end
|
10
|
-
|
11
6
|
def tokenize(text)
|
12
7
|
surface_form, reading = 0, 1
|
13
8
|
stdout, stderr, status = Open3.capture3("mecab -Ochasen", :stdin_data => sanitize_text(text))
|
14
9
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
stdout.encode!("UTF-8", "UTF-8", :invalid => :replace, :undef => :replace, :replace => "�")
|
20
|
-
stdout.split("\n")
|
21
|
-
end
|
10
|
+
lines = split_stdout(stdout)
|
11
|
+
|
12
|
+
lines.reduce([]) do |kanji_tokens, line|
|
13
|
+
return kanji_tokens if line == 'EOS'
|
22
14
|
|
23
|
-
lines.inject([]) do |output, line|
|
24
15
|
columns = line.split("\t")
|
25
|
-
|
16
|
+
kanji_tokens << {
|
26
17
|
:surface_form => columns[surface_form],
|
27
18
|
:reading => columns[reading]
|
28
|
-
}
|
29
|
-
|
19
|
+
}
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
private
|
24
|
+
|
25
|
+
def sanitize_text(text)
|
26
|
+
format("%s\n", text.tr("\n", ""))
|
27
|
+
end
|
28
|
+
|
29
|
+
def split_stdout(stdout)
|
30
|
+
# Avoid `ArgumentError - invalid byte sequence in UTF-8`
|
31
|
+
if stdout.valid_encoding?
|
32
|
+
stdout.split("\n")
|
33
|
+
else
|
34
|
+
stdout.encode!("UTF-8", "UTF-8", :invalid => :replace, :undef => :replace, :replace => "�")
|
35
|
+
stdout.split("\n")
|
30
36
|
end
|
31
37
|
end
|
32
38
|
end
|
data/lib/furigana/reader.rb
CHANGED
@@ -1,10 +1,12 @@
|
|
1
|
-
|
1
|
+
require 'diff/lcs'
|
2
|
+
require 'nkf'
|
3
|
+
|
2
4
|
module Furigana
|
3
5
|
class Reader
|
4
6
|
def reading(text)
|
5
|
-
Mecab.tokenize(text).
|
6
|
-
|
7
|
-
list +=
|
7
|
+
Mecab.tokenize(text).reduce([]) do |list, token|
|
8
|
+
with_reading = add_reading(token)
|
9
|
+
list += with_reading if with_reading
|
8
10
|
list
|
9
11
|
end
|
10
12
|
end
|
@@ -20,11 +22,12 @@ module Furigana
|
|
20
22
|
Diff::LCS.sdiff(k2h(token[:surface_form]), k2h(token[:reading]))
|
21
23
|
end
|
22
24
|
|
23
|
-
def
|
25
|
+
def add_reading(token)
|
24
26
|
states = { kanji_and_yomi: '!', yomi: '+', kana: '=' }
|
25
27
|
kanji, yomi = 0, 1
|
26
28
|
|
27
29
|
list = []
|
30
|
+
|
28
31
|
if /\p{Han}/.match(token[:surface_form])
|
29
32
|
on_kanji = false
|
30
33
|
diff_token_surface_form_and_reading(token).each do |part|
|
@@ -41,6 +44,7 @@ module Furigana
|
|
41
44
|
end
|
42
45
|
end
|
43
46
|
end
|
47
|
+
|
44
48
|
list
|
45
49
|
end
|
46
50
|
end
|
data/lib/furigana/version.rb
CHANGED
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: furigana
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael V. O'Brien
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-08-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: diff-lcs
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
13
27
|
- !ruby/object:Gem::Dependency
|
14
28
|
name: rake
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -39,13 +53,27 @@ dependencies:
|
|
39
53
|
- !ruby/object:Gem::Version
|
40
54
|
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
56
|
+
name: pry
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
44
58
|
requirements:
|
45
59
|
- - ">="
|
46
60
|
- !ruby/object:Gem::Version
|
47
61
|
version: '0'
|
48
|
-
type: :
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: pry-doc
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
49
77
|
prerelease: false
|
50
78
|
version_requirements: !ruby/object:Gem::Requirement
|
51
79
|
requirements:
|
@@ -93,7 +121,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
93
121
|
version: '0'
|
94
122
|
requirements: []
|
95
123
|
rubyforge_project:
|
96
|
-
rubygems_version: 2.
|
124
|
+
rubygems_version: 2.4.3
|
97
125
|
signing_key:
|
98
126
|
specification_version: 4
|
99
127
|
summary: Add furigana to text
|