furigana 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/furigana +1 -59
- data/lib/furigana.rb +0 -4
- data/lib/furigana/formatter/base.rb +75 -35
- data/lib/furigana/formatter/html.rb +2 -5
- data/lib/furigana/formatter/json.rb +4 -4
- data/lib/furigana/formatter/text.rb +2 -5
- data/lib/furigana/formatter/yomikata.rb +12 -13
- data/lib/furigana/mecab.rb +22 -16
- data/lib/furigana/reader.rb +9 -5
- data/lib/furigana/version.rb +1 -1
- metadata +33 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6ff27abadc679f46eaf32b412c9122df44c55b29
|
4
|
+
data.tar.gz: 6c5e06f33c1dded6d583a7bb7878a89ad9973d42
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9fa6a5abd830cf2488924aef22a36ff3744706026d6a8713101118aef9636bf01785464421da019dcf601a268ff75cf531b6caff0df2e3137575829a29c72531
|
7
|
+
data.tar.gz: 3616341a08872b570b8e5fa3e0f7ae95607e0588b2fbd8def30b0b873f5fc41c2f0f3aa507b5eb15db3e64159219f01dd023ee71dde97ae211725e55df0749a2
|
data/bin/furigana
CHANGED
@@ -1,62 +1,4 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'furigana'
|
4
|
-
require 'singleton'
|
5
|
-
require 'ostruct'
|
6
|
-
require 'optparse'
|
7
|
-
|
8
|
-
module Furigana
|
9
|
-
class CLI
|
10
|
-
include Singleton
|
11
|
-
|
12
|
-
def initialize
|
13
|
-
@settings = OpenStruct.new
|
14
|
-
@settings.format = :text
|
15
|
-
end
|
16
|
-
|
17
|
-
def parse_options
|
18
|
-
OptionParser.new do |opts|
|
19
|
-
opts.banner = "Usage: furigana [options] [file]"
|
20
|
-
|
21
|
-
opts.on("--text", "Add furigana and output text (default)") do
|
22
|
-
@settings.format = :text
|
23
|
-
end
|
24
|
-
opts.on("--html", "Add furigana and output HTML") do
|
25
|
-
@settings.format = :html
|
26
|
-
end
|
27
|
-
opts.on("--yomikata", "Output yomikata only") do
|
28
|
-
@settings.format = :yomikata
|
29
|
-
end
|
30
|
-
opts.on("--json", "Add furigana and output JSON") do
|
31
|
-
@settings.format = :json
|
32
|
-
end
|
33
|
-
opts.on_tail("-h", "--help", "Show this message") do
|
34
|
-
puts opts
|
35
|
-
exit
|
36
|
-
end
|
37
|
-
opts.on_tail("--version", "Show version") do
|
38
|
-
puts Furigana::VERSION
|
39
|
-
exit
|
40
|
-
end
|
41
|
-
end.parse!
|
42
|
-
end
|
43
|
-
|
44
|
-
def start
|
45
|
-
parse_options
|
46
|
-
|
47
|
-
input = ARGF.read
|
48
|
-
case @settings.format
|
49
|
-
when :text
|
50
|
-
puts Formatter::Text.format(input, Reader.new.reading(input))
|
51
|
-
when :html
|
52
|
-
puts Formatter::HTML.format(input, Reader.new.reading(input))
|
53
|
-
when :yomikata
|
54
|
-
puts Formatter::Yomikata.format(input, Reader.new.reading(input))
|
55
|
-
when :json
|
56
|
-
puts Formatter::JSON.format(input, Reader.new.reading(input))
|
57
|
-
end
|
58
|
-
end
|
59
|
-
end
|
60
|
-
end
|
2
|
+
require 'furigana/cli'
|
61
3
|
|
62
4
|
Furigana::CLI.instance.start
|
data/lib/furigana.rb
CHANGED
data/lib/furigana/formatter/base.rb
CHANGED
@@ -1,44 +1,84 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
1
|
module Furigana
|
3
2
|
module Formatter
|
4
3
|
class Base
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
4
|
+
SURFACE_FORM, READING = 0, 1
|
5
|
+
|
6
|
+
def initialize(text, kanji_tokens)
|
7
|
+
@text = text
|
8
|
+
@kanji_tokens = kanji_tokens
|
9
|
+
end
|
10
|
+
|
11
|
+
def render
|
12
|
+
reset
|
13
|
+
|
14
|
+
@text.each_char do |char|
|
15
|
+
if no_more_kanji_tokens?
|
16
|
+
@new_text += char
|
17
|
+
next
|
18
|
+
end
|
19
|
+
|
20
|
+
@substring += char
|
21
|
+
|
22
|
+
if not_a_kanji_group_match? char
|
23
|
+
@new_text += @substring
|
24
|
+
reset_substring
|
25
|
+
next
|
14
26
|
end
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
current_token = tokens_enum.next
|
25
|
-
rescue StopIteration
|
26
|
-
current_token = nil
|
27
|
-
end
|
28
|
-
substring, pos = "", 0
|
29
|
-
else
|
30
|
-
pos += 1
|
31
|
-
end
|
32
|
-
else # not a match
|
33
|
-
new_text += substring
|
34
|
-
substring, pos = "", 0
|
35
|
-
end
|
36
|
-
else # no more tokens
|
37
|
-
new_text += char
|
38
|
-
end
|
27
|
+
|
28
|
+
if kanji_group_match?
|
29
|
+
# replace kanji group with formatting
|
30
|
+
@new_text += replacement(@current_token[SURFACE_FORM], @current_token[READING])
|
31
|
+
@current_token = next_token
|
32
|
+
reset_substring
|
33
|
+
else
|
34
|
+
# kanji token pos advances with char pos
|
35
|
+
increment_kanji_char_pos
|
39
36
|
end
|
40
|
-
new_text
|
41
37
|
end
|
38
|
+
|
39
|
+
@new_text
|
40
|
+
end
|
41
|
+
|
42
|
+
private
|
43
|
+
|
44
|
+
def reset
|
45
|
+
@new_text = ""
|
46
|
+
reset_substring
|
47
|
+
|
48
|
+
@kanji_tokens_enum = @kanji_tokens.to_enum
|
49
|
+
@current_token = next_token
|
50
|
+
end
|
51
|
+
|
52
|
+
def reset_substring
|
53
|
+
@substring = ""
|
54
|
+
@kanji_char_pos = 0
|
55
|
+
end
|
56
|
+
|
57
|
+
def increment_kanji_char_pos
|
58
|
+
@kanji_char_pos += 1
|
59
|
+
end
|
60
|
+
|
61
|
+
def no_more_kanji_tokens?
|
62
|
+
@current_token.nil?
|
63
|
+
end
|
64
|
+
|
65
|
+
def not_a_kanji_group_match?(char)
|
66
|
+
current_kanji_char = @current_token[SURFACE_FORM][@kanji_char_pos]
|
67
|
+
char != current_kanji_char
|
68
|
+
end
|
69
|
+
|
70
|
+
def kanji_group_match?
|
71
|
+
@kanji_char_pos == (@current_token[SURFACE_FORM].length - 1)
|
72
|
+
end
|
73
|
+
|
74
|
+
def next_token
|
75
|
+
@kanji_tokens_enum.next
|
76
|
+
rescue StopIteration
|
77
|
+
nil
|
78
|
+
end
|
79
|
+
|
80
|
+
def replacement(surface_form, reading)
|
81
|
+
fail NotImplementedError, "#{self.class} must implement `#{__method__}`"
|
42
82
|
end
|
43
83
|
end
|
44
84
|
end
|
data/lib/furigana/formatter/html.rb
CHANGED
@@ -1,11 +1,8 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
1
|
module Furigana
|
3
2
|
module Formatter
|
4
3
|
class HTML < Formatter::Base
|
5
|
-
|
6
|
-
|
7
|
-
"<ruby><rb>%s</rb><rp>【</rp><rt>%s</rt><rp>】</rp></ruby>" % [surface_form, reading]
|
8
|
-
end
|
4
|
+
def replacement(surface_form, reading)
|
5
|
+
"<ruby><rb>%s</rb><rp>【</rp><rt>%s</rt><rp>】</rp></ruby>" % [surface_form, reading]
|
9
6
|
end
|
10
7
|
end
|
11
8
|
end
|
data/lib/furigana/formatter/text.rb
CHANGED
@@ -1,11 +1,8 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
1
|
module Furigana
|
3
2
|
module Formatter
|
4
3
|
class Text < Formatter::Base
|
5
|
-
|
6
|
-
|
7
|
-
"%s【%s】" % [surface_form, reading]
|
8
|
-
end
|
4
|
+
def replacement(surface_form, reading)
|
5
|
+
"%s【%s】" % [surface_form, reading]
|
9
6
|
end
|
10
7
|
end
|
11
8
|
end
|
data/lib/furigana/formatter/yomikata.rb
CHANGED
@@ -1,22 +1,21 @@
|
|
1
|
-
|
1
|
+
require 'nkf'
|
2
|
+
|
2
3
|
module Furigana
|
3
4
|
module Formatter
|
4
5
|
class Yomikata < Formatter::Base
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
end
|
6
|
+
def replacement(surface_form, reading)
|
7
|
+
reading
|
8
|
+
end
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
|
10
|
+
def render
|
11
|
+
k2h(super)
|
12
|
+
end
|
13
13
|
|
14
|
-
|
14
|
+
private
|
15
15
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
end
|
16
|
+
def k2h(k)
|
17
|
+
return nil if k.nil?
|
18
|
+
NKF.nkf("-h1 -w", k)
|
20
19
|
end
|
21
20
|
end
|
22
21
|
end
|
data/lib/furigana/mecab.rb
CHANGED
@@ -1,32 +1,38 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
1
|
require 'open3'
|
3
2
|
|
4
3
|
module Furigana
|
5
4
|
class Mecab
|
6
5
|
class << self
|
7
|
-
def sanitize_text(text)
|
8
|
-
format("%s\n", text.tr("\n", ""))
|
9
|
-
end
|
10
|
-
|
11
6
|
def tokenize(text)
|
12
7
|
surface_form, reading = 0, 1
|
13
8
|
stdout, stderr, status = Open3.capture3("mecab -Ochasen", :stdin_data => sanitize_text(text))
|
14
9
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
stdout.encode!("UTF-8", "UTF-8", :invalid => :replace, :undef => :replace, :replace => "�")
|
20
|
-
stdout.split("\n")
|
21
|
-
end
|
10
|
+
lines = split_stdout(stdout)
|
11
|
+
|
12
|
+
lines.reduce([]) do |kanji_tokens, line|
|
13
|
+
return kanji_tokens if line == 'EOS'
|
22
14
|
|
23
|
-
lines.inject([]) do |output, line|
|
24
15
|
columns = line.split("\t")
|
25
|
-
|
16
|
+
kanji_tokens << {
|
26
17
|
:surface_form => columns[surface_form],
|
27
18
|
:reading => columns[reading]
|
28
|
-
}
|
29
|
-
|
19
|
+
}
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
private
|
24
|
+
|
25
|
+
def sanitize_text(text)
|
26
|
+
format("%s\n", text.tr("\n", ""))
|
27
|
+
end
|
28
|
+
|
29
|
+
def split_stdout(stdout)
|
30
|
+
# Avoid `ArgumentError - invalid byte sequence in UTF-8`
|
31
|
+
if stdout.valid_encoding?
|
32
|
+
stdout.split("\n")
|
33
|
+
else
|
34
|
+
stdout.encode!("UTF-8", "UTF-8", :invalid => :replace, :undef => :replace, :replace => "�")
|
35
|
+
stdout.split("\n")
|
30
36
|
end
|
31
37
|
end
|
32
38
|
end
|
data/lib/furigana/reader.rb
CHANGED
@@ -1,10 +1,12 @@
|
|
1
|
-
|
1
|
+
require 'diff/lcs'
|
2
|
+
require 'nkf'
|
3
|
+
|
2
4
|
module Furigana
|
3
5
|
class Reader
|
4
6
|
def reading(text)
|
5
|
-
Mecab.tokenize(text).
|
6
|
-
|
7
|
-
list +=
|
7
|
+
Mecab.tokenize(text).reduce([]) do |list, token|
|
8
|
+
with_reading = add_reading(token)
|
9
|
+
list += with_reading if with_reading
|
8
10
|
list
|
9
11
|
end
|
10
12
|
end
|
@@ -20,11 +22,12 @@ module Furigana
|
|
20
22
|
Diff::LCS.sdiff(k2h(token[:surface_form]), k2h(token[:reading]))
|
21
23
|
end
|
22
24
|
|
23
|
-
def
|
25
|
+
def add_reading(token)
|
24
26
|
states = { kanji_and_yomi: '!', yomi: '+', kana: '=' }
|
25
27
|
kanji, yomi = 0, 1
|
26
28
|
|
27
29
|
list = []
|
30
|
+
|
28
31
|
if /\p{Han}/.match(token[:surface_form])
|
29
32
|
on_kanji = false
|
30
33
|
diff_token_surface_form_and_reading(token).each do |part|
|
@@ -41,6 +44,7 @@ module Furigana
|
|
41
44
|
end
|
42
45
|
end
|
43
46
|
end
|
47
|
+
|
44
48
|
list
|
45
49
|
end
|
46
50
|
end
|
data/lib/furigana/version.rb
CHANGED
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: furigana
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael V. O'Brien
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-08-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: diff-lcs
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
13
27
|
- !ruby/object:Gem::Dependency
|
14
28
|
name: rake
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -39,13 +53,27 @@ dependencies:
|
|
39
53
|
- !ruby/object:Gem::Version
|
40
54
|
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
56
|
+
name: pry
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
44
58
|
requirements:
|
45
59
|
- - ">="
|
46
60
|
- !ruby/object:Gem::Version
|
47
61
|
version: '0'
|
48
|
-
type: :
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: pry-doc
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
49
77
|
prerelease: false
|
50
78
|
version_requirements: !ruby/object:Gem::Requirement
|
51
79
|
requirements:
|
@@ -93,7 +121,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
93
121
|
version: '0'
|
94
122
|
requirements: []
|
95
123
|
rubyforge_project:
|
96
|
-
rubygems_version: 2.
|
124
|
+
rubygems_version: 2.4.3
|
97
125
|
signing_key:
|
98
126
|
specification_version: 4
|
99
127
|
summary: Add furigana to text
|