furigana 0.0.6 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bbcd137eebeed04164e7f888a408d6bc06892257
4
- data.tar.gz: 80e9c71faddc7cf50b096beef50ec686b277f192
3
+ metadata.gz: 6ff27abadc679f46eaf32b412c9122df44c55b29
4
+ data.tar.gz: 6c5e06f33c1dded6d583a7bb7878a89ad9973d42
5
5
  SHA512:
6
- metadata.gz: 221f507c9f155a7a24628cf42824e647dc2100f31bb7149461c0e305563c9c193a2db49ba02b5e6e8b3d178ef933996b943e58fb1809767d7364943d13818d0f
7
- data.tar.gz: 58f004781c6b5cb4fc8ae0fe88943648492fad393d4db2048b5440f65db433a9a5893599b4782be88c915815a49eac6175660b83a7a6213bfa87c03898668922
6
+ metadata.gz: 9fa6a5abd830cf2488924aef22a36ff3744706026d6a8713101118aef9636bf01785464421da019dcf601a268ff75cf531b6caff0df2e3137575829a29c72531
7
+ data.tar.gz: 3616341a08872b570b8e5fa3e0f7ae95607e0588b2fbd8def30b0b873f5fc41c2f0f3aa507b5eb15db3e64159219f01dd023ee71dde97ae211725e55df0749a2
@@ -1,62 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
- require 'furigana'
4
- require 'singleton'
5
- require 'ostruct'
6
- require 'optparse'
7
-
8
- module Furigana
9
- class CLI
10
- include Singleton
11
-
12
- def initialize
13
- @settings = OpenStruct.new
14
- @settings.format = :text
15
- end
16
-
17
- def parse_options
18
- OptionParser.new do |opts|
19
- opts.banner = "Usage: furigana [options] [file]"
20
-
21
- opts.on("--text", "Add furigana and output text (default)") do
22
- @settings.format = :text
23
- end
24
- opts.on("--html", "Add furigana and output HTML") do
25
- @settings.format = :html
26
- end
27
- opts.on("--yomikata", "Output yomikata only") do
28
- @settings.format = :yomikata
29
- end
30
- opts.on("--json", "Add furigana and output JSON") do
31
- @settings.format = :json
32
- end
33
- opts.on_tail("-h", "--help", "Show this message") do
34
- puts opts
35
- exit
36
- end
37
- opts.on_tail("--version", "Show version") do
38
- puts Furigana::VERSION
39
- exit
40
- end
41
- end.parse!
42
- end
43
-
44
- def start
45
- parse_options
46
-
47
- input = ARGF.read
48
- case @settings.format
49
- when :text
50
- puts Formatter::Text.format(input, Reader.new.reading(input))
51
- when :html
52
- puts Formatter::HTML.format(input, Reader.new.reading(input))
53
- when :yomikata
54
- puts Formatter::Yomikata.format(input, Reader.new.reading(input))
55
- when :json
56
- puts Formatter::JSON.format(input, Reader.new.reading(input))
57
- end
58
- end
59
- end
60
- end
2
+ require 'furigana/cli'
61
3
 
62
4
  Furigana::CLI.instance.start
@@ -1,7 +1,3 @@
1
- require 'diff/lcs'
2
- require 'nkf'
3
- require 'json'
4
-
5
1
  require 'furigana/version'
6
2
  require 'furigana/mecab'
7
3
  require 'furigana/reader'
@@ -1,44 +1,84 @@
1
- # -*- coding: utf-8 -*-
2
1
  module Furigana
3
2
  module Formatter
4
3
  class Base
5
- class << self
6
- def format(text, tokens)
7
- surface_form, reading = 0, 1
8
- new_text = ''
9
- tokens_enum = tokens.to_enum
10
- begin
11
- current_token = tokens_enum.next
12
- rescue StopIteration
13
- current_token = nil
4
+ SURFACE_FORM, READING = 0, 1
5
+
6
+ def initialize(text, kanji_tokens)
7
+ @text = text
8
+ @kanji_tokens = kanji_tokens
9
+ end
10
+
11
+ def render
12
+ reset
13
+
14
+ @text.each_char do |char|
15
+ if no_more_kanji_tokens?
16
+ @new_text += char
17
+ next
18
+ end
19
+
20
+ @substring += char
21
+
22
+ if not_a_kanji_group_match? char
23
+ @new_text += @substring
24
+ reset_substring
25
+ next
14
26
  end
15
- substring, pos = "", 0
16
-
17
- text.each_char do |char|
18
- if current_token
19
- substring += char
20
- if char == current_token[surface_form][pos]
21
- if pos == current_token[surface_form].length-1
22
- new_text += replacement(current_token[surface_form], current_token[reading])
23
- begin
24
- current_token = tokens_enum.next
25
- rescue StopIteration
26
- current_token = nil
27
- end
28
- substring, pos = "", 0
29
- else
30
- pos += 1
31
- end
32
- else # not a match
33
- new_text += substring
34
- substring, pos = "", 0
35
- end
36
- else # no more tokens
37
- new_text += char
38
- end
27
+
28
+ if kanji_group_match?
29
+ # replace kanji group with formatting
30
+ @new_text += replacement(@current_token[SURFACE_FORM], @current_token[READING])
31
+ @current_token = next_token
32
+ reset_substring
33
+ else
34
+ # kanji token pos advances with char pos
35
+ increment_kanji_char_pos
39
36
  end
40
- new_text
41
37
  end
38
+
39
+ @new_text
40
+ end
41
+
42
+ private
43
+
44
+ def reset
45
+ @new_text = ""
46
+ reset_substring
47
+
48
+ @kanji_tokens_enum = @kanji_tokens.to_enum
49
+ @current_token = next_token
50
+ end
51
+
52
+ def reset_substring
53
+ @substring = ""
54
+ @kanji_char_pos = 0
55
+ end
56
+
57
+ def increment_kanji_char_pos
58
+ @kanji_char_pos += 1
59
+ end
60
+
61
+ def no_more_kanji_tokens?
62
+ @current_token.nil?
63
+ end
64
+
65
+ def not_a_kanji_group_match?(char)
66
+ current_kanji_char = @current_token[SURFACE_FORM][@kanji_char_pos]
67
+ char != current_kanji_char
68
+ end
69
+
70
+ def kanji_group_match?
71
+ @kanji_char_pos == (@current_token[SURFACE_FORM].length - 1)
72
+ end
73
+
74
+ def next_token
75
+ @kanji_tokens_enum.next
76
+ rescue StopIteration
77
+ nil
78
+ end
79
+
80
+ def replacement(surface_form, reading)
81
+ fail NotImplementedError, "#{self.class} must implement `#{__method__}`"
42
82
  end
43
83
  end
44
84
  end
@@ -1,11 +1,8 @@
1
- # -*- coding: utf-8 -*-
2
1
  module Furigana
3
2
  module Formatter
4
3
  class HTML < Formatter::Base
5
- class << self
6
- def replacement(surface_form, reading)
7
- "<ruby><rb>%s</rb><rp>【</rp><rt>%s</rt><rp>】</rp></ruby>" % [surface_form, reading]
8
- end
4
+ def replacement(surface_form, reading)
5
+ "<ruby><rb>%s</rb><rp>【</rp><rt>%s</rt><rp>】</rp></ruby>" % [surface_form, reading]
9
6
  end
10
7
  end
11
8
  end
@@ -1,10 +1,10 @@
1
+ require 'json'
2
+
1
3
  module Furigana
2
4
  module Formatter
3
5
  class JSON < Formatter::Base
4
- class << self
5
- def format(text, tokens)
6
- tokens.to_json
7
- end
6
+ def render
7
+ @kanji_tokens.to_json
8
8
  end
9
9
  end
10
10
  end
@@ -1,11 +1,8 @@
1
- # -*- coding: utf-8 -*-
2
1
  module Furigana
3
2
  module Formatter
4
3
  class Text < Formatter::Base
5
- class << self
6
- def replacement(surface_form, reading)
7
- "%s【%s】" % [surface_form, reading]
8
- end
4
+ def replacement(surface_form, reading)
5
+ "%s【%s】" % [surface_form, reading]
9
6
  end
10
7
  end
11
8
  end
@@ -1,22 +1,21 @@
1
- # -*- coding: utf-8 -*-
1
+ require 'nkf'
2
+
2
3
  module Furigana
3
4
  module Formatter
4
5
  class Yomikata < Formatter::Base
5
- class << self
6
- def replacement(surface_form, reading)
7
- reading
8
- end
6
+ def replacement(surface_form, reading)
7
+ reading
8
+ end
9
9
 
10
- def format(text, tokens)
11
- k2h(super)
12
- end
10
+ def render
11
+ k2h(super)
12
+ end
13
13
 
14
- private
14
+ private
15
15
 
16
- def k2h(k)
17
- return nil if k.nil?
18
- NKF.nkf("-h1 -w", k)
19
- end
16
+ def k2h(k)
17
+ return nil if k.nil?
18
+ NKF.nkf("-h1 -w", k)
20
19
  end
21
20
  end
22
21
  end
@@ -1,32 +1,38 @@
1
- # -*- coding: utf-8 -*-
2
1
  require 'open3'
3
2
 
4
3
  module Furigana
5
4
  class Mecab
6
5
  class << self
7
- def sanitize_text(text)
8
- format("%s\n", text.tr("\n", ""))
9
- end
10
-
11
6
  def tokenize(text)
12
7
  surface_form, reading = 0, 1
13
8
  stdout, stderr, status = Open3.capture3("mecab -Ochasen", :stdin_data => sanitize_text(text))
14
9
 
15
- # Avoid `ArgumentError - invalid byte sequence in UTF-8`
16
- lines = if stdout.valid_encoding?
17
- stdout.split("\n")
18
- else
19
- stdout.encode!("UTF-8", "UTF-8", :invalid => :replace, :undef => :replace, :replace => "�")
20
- stdout.split("\n")
21
- end
10
+ lines = split_stdout(stdout)
11
+
12
+ lines.reduce([]) do |kanji_tokens, line|
13
+ return kanji_tokens if line == 'EOS'
22
14
 
23
- lines.inject([]) do |output, line|
24
15
  columns = line.split("\t")
25
- output << {
16
+ kanji_tokens << {
26
17
  :surface_form => columns[surface_form],
27
18
  :reading => columns[reading]
28
- } if columns[surface_form] != 'EOS'
29
- output
19
+ }
20
+ end
21
+ end
22
+
23
+ private
24
+
25
+ def sanitize_text(text)
26
+ format("%s\n", text.tr("\n", ""))
27
+ end
28
+
29
+ def split_stdout(stdout)
30
+ # Avoid `ArgumentError - invalid byte sequence in UTF-8`
31
+ if stdout.valid_encoding?
32
+ stdout.split("\n")
33
+ else
34
+ stdout.encode!("UTF-8", "UTF-8", :invalid => :replace, :undef => :replace, :replace => "�")
35
+ stdout.split("\n")
30
36
  end
31
37
  end
32
38
  end
@@ -1,10 +1,12 @@
1
- # -*- coding: utf-8 -*-
1
+ require 'diff/lcs'
2
+ require 'nkf'
3
+
2
4
  module Furigana
3
5
  class Reader
4
6
  def reading(text)
5
- Mecab.tokenize(text).inject([]) do |list, token|
6
- with_furigana = add_furigana(token)
7
- list += with_furigana if with_furigana
7
+ Mecab.tokenize(text).reduce([]) do |list, token|
8
+ with_reading = add_reading(token)
9
+ list += with_reading if with_reading
8
10
  list
9
11
  end
10
12
  end
@@ -20,11 +22,12 @@ module Furigana
20
22
  Diff::LCS.sdiff(k2h(token[:surface_form]), k2h(token[:reading]))
21
23
  end
22
24
 
23
- def add_furigana(token)
25
+ def add_reading(token)
24
26
  states = { kanji_and_yomi: '!', yomi: '+', kana: '=' }
25
27
  kanji, yomi = 0, 1
26
28
 
27
29
  list = []
30
+
28
31
  if /\p{Han}/.match(token[:surface_form])
29
32
  on_kanji = false
30
33
  diff_token_surface_form_and_reading(token).each do |part|
@@ -41,6 +44,7 @@ module Furigana
41
44
  end
42
45
  end
43
46
  end
47
+
44
48
  list
45
49
  end
46
50
  end
@@ -1,3 +1,3 @@
1
1
  module Furigana
2
- VERSION = '0.0.6'
2
+ VERSION = '0.0.7'
3
3
  end
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: furigana
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael V. O'Brien
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-02-25 00:00:00.000000000 Z
11
+ date: 2016-08-17 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: diff-lcs
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: rake
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -39,13 +53,27 @@ dependencies:
39
53
  - !ruby/object:Gem::Version
40
54
  version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
- name: diff-lcs
56
+ name: pry
43
57
  requirement: !ruby/object:Gem::Requirement
44
58
  requirements:
45
59
  - - ">="
46
60
  - !ruby/object:Gem::Version
47
61
  version: '0'
48
- type: :runtime
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pry-doc
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
49
77
  prerelease: false
50
78
  version_requirements: !ruby/object:Gem::Requirement
51
79
  requirements:
@@ -93,7 +121,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
93
121
  version: '0'
94
122
  requirements: []
95
123
  rubyforge_project:
96
- rubygems_version: 2.2.2
124
+ rubygems_version: 2.4.3
97
125
  signing_key:
98
126
  specification_version: 4
99
127
  summary: Add furigana to text