furigana 0.0.6 → 0.0.7

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bbcd137eebeed04164e7f888a408d6bc06892257
4
- data.tar.gz: 80e9c71faddc7cf50b096beef50ec686b277f192
3
+ metadata.gz: 6ff27abadc679f46eaf32b412c9122df44c55b29
4
+ data.tar.gz: 6c5e06f33c1dded6d583a7bb7878a89ad9973d42
5
5
  SHA512:
6
- metadata.gz: 221f507c9f155a7a24628cf42824e647dc2100f31bb7149461c0e305563c9c193a2db49ba02b5e6e8b3d178ef933996b943e58fb1809767d7364943d13818d0f
7
- data.tar.gz: 58f004781c6b5cb4fc8ae0fe88943648492fad393d4db2048b5440f65db433a9a5893599b4782be88c915815a49eac6175660b83a7a6213bfa87c03898668922
6
+ metadata.gz: 9fa6a5abd830cf2488924aef22a36ff3744706026d6a8713101118aef9636bf01785464421da019dcf601a268ff75cf531b6caff0df2e3137575829a29c72531
7
+ data.tar.gz: 3616341a08872b570b8e5fa3e0f7ae95607e0588b2fbd8def30b0b873f5fc41c2f0f3aa507b5eb15db3e64159219f01dd023ee71dde97ae211725e55df0749a2
@@ -1,62 +1,4 @@
1
1
  #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
- require 'furigana'
4
- require 'singleton'
5
- require 'ostruct'
6
- require 'optparse'
7
-
8
- module Furigana
9
- class CLI
10
- include Singleton
11
-
12
- def initialize
13
- @settings = OpenStruct.new
14
- @settings.format = :text
15
- end
16
-
17
- def parse_options
18
- OptionParser.new do |opts|
19
- opts.banner = "Usage: furigana [options] [file]"
20
-
21
- opts.on("--text", "Add furigana and output text (default)") do
22
- @settings.format = :text
23
- end
24
- opts.on("--html", "Add furigana and output HTML") do
25
- @settings.format = :html
26
- end
27
- opts.on("--yomikata", "Output yomikata only") do
28
- @settings.format = :yomikata
29
- end
30
- opts.on("--json", "Add furigana and output JSON") do
31
- @settings.format = :json
32
- end
33
- opts.on_tail("-h", "--help", "Show this message") do
34
- puts opts
35
- exit
36
- end
37
- opts.on_tail("--version", "Show version") do
38
- puts Furigana::VERSION
39
- exit
40
- end
41
- end.parse!
42
- end
43
-
44
- def start
45
- parse_options
46
-
47
- input = ARGF.read
48
- case @settings.format
49
- when :text
50
- puts Formatter::Text.format(input, Reader.new.reading(input))
51
- when :html
52
- puts Formatter::HTML.format(input, Reader.new.reading(input))
53
- when :yomikata
54
- puts Formatter::Yomikata.format(input, Reader.new.reading(input))
55
- when :json
56
- puts Formatter::JSON.format(input, Reader.new.reading(input))
57
- end
58
- end
59
- end
60
- end
2
+ require 'furigana/cli'
61
3
 
62
4
  Furigana::CLI.instance.start
@@ -1,7 +1,3 @@
1
- require 'diff/lcs'
2
- require 'nkf'
3
- require 'json'
4
-
5
1
  require 'furigana/version'
6
2
  require 'furigana/mecab'
7
3
  require 'furigana/reader'
@@ -1,44 +1,84 @@
1
- # -*- coding: utf-8 -*-
2
1
  module Furigana
3
2
  module Formatter
4
3
  class Base
5
- class << self
6
- def format(text, tokens)
7
- surface_form, reading = 0, 1
8
- new_text = ''
9
- tokens_enum = tokens.to_enum
10
- begin
11
- current_token = tokens_enum.next
12
- rescue StopIteration
13
- current_token = nil
4
+ SURFACE_FORM, READING = 0, 1
5
+
6
+ def initialize(text, kanji_tokens)
7
+ @text = text
8
+ @kanji_tokens = kanji_tokens
9
+ end
10
+
11
+ def render
12
+ reset
13
+
14
+ @text.each_char do |char|
15
+ if no_more_kanji_tokens?
16
+ @new_text += char
17
+ next
18
+ end
19
+
20
+ @substring += char
21
+
22
+ if not_a_kanji_group_match? char
23
+ @new_text += @substring
24
+ reset_substring
25
+ next
14
26
  end
15
- substring, pos = "", 0
16
-
17
- text.each_char do |char|
18
- if current_token
19
- substring += char
20
- if char == current_token[surface_form][pos]
21
- if pos == current_token[surface_form].length-1
22
- new_text += replacement(current_token[surface_form], current_token[reading])
23
- begin
24
- current_token = tokens_enum.next
25
- rescue StopIteration
26
- current_token = nil
27
- end
28
- substring, pos = "", 0
29
- else
30
- pos += 1
31
- end
32
- else # not a match
33
- new_text += substring
34
- substring, pos = "", 0
35
- end
36
- else # no more tokens
37
- new_text += char
38
- end
27
+
28
+ if kanji_group_match?
29
+ # replace kanji group with formatting
30
+ @new_text += replacement(@current_token[SURFACE_FORM], @current_token[READING])
31
+ @current_token = next_token
32
+ reset_substring
33
+ else
34
+ # kanji token pos advances with char pos
35
+ increment_kanji_char_pos
39
36
  end
40
- new_text
41
37
  end
38
+
39
+ @new_text
40
+ end
41
+
42
+ private
43
+
44
+ def reset
45
+ @new_text = ""
46
+ reset_substring
47
+
48
+ @kanji_tokens_enum = @kanji_tokens.to_enum
49
+ @current_token = next_token
50
+ end
51
+
52
+ def reset_substring
53
+ @substring = ""
54
+ @kanji_char_pos = 0
55
+ end
56
+
57
+ def increment_kanji_char_pos
58
+ @kanji_char_pos += 1
59
+ end
60
+
61
+ def no_more_kanji_tokens?
62
+ @current_token.nil?
63
+ end
64
+
65
+ def not_a_kanji_group_match?(char)
66
+ current_kanji_char = @current_token[SURFACE_FORM][@kanji_char_pos]
67
+ char != current_kanji_char
68
+ end
69
+
70
+ def kanji_group_match?
71
+ @kanji_char_pos == (@current_token[SURFACE_FORM].length - 1)
72
+ end
73
+
74
+ def next_token
75
+ @kanji_tokens_enum.next
76
+ rescue StopIteration
77
+ nil
78
+ end
79
+
80
+ def replacement(surface_form, reading)
81
+ fail NotImplementedError, "#{self.class} must implement `#{__method__}`"
42
82
  end
43
83
  end
44
84
  end
@@ -1,11 +1,8 @@
1
- # -*- coding: utf-8 -*-
2
1
  module Furigana
3
2
  module Formatter
4
3
  class HTML < Formatter::Base
5
- class << self
6
- def replacement(surface_form, reading)
7
- "<ruby><rb>%s</rb><rp>【</rp><rt>%s</rt><rp>】</rp></ruby>" % [surface_form, reading]
8
- end
4
+ def replacement(surface_form, reading)
5
+ "<ruby><rb>%s</rb><rp>【</rp><rt>%s</rt><rp>】</rp></ruby>" % [surface_form, reading]
9
6
  end
10
7
  end
11
8
  end
@@ -1,10 +1,10 @@
1
+ require 'json'
2
+
1
3
  module Furigana
2
4
  module Formatter
3
5
  class JSON < Formatter::Base
4
- class << self
5
- def format(text, tokens)
6
- tokens.to_json
7
- end
6
+ def render
7
+ @kanji_tokens.to_json
8
8
  end
9
9
  end
10
10
  end
@@ -1,11 +1,8 @@
1
- # -*- coding: utf-8 -*-
2
1
  module Furigana
3
2
  module Formatter
4
3
  class Text < Formatter::Base
5
- class << self
6
- def replacement(surface_form, reading)
7
- "%s【%s】" % [surface_form, reading]
8
- end
4
+ def replacement(surface_form, reading)
5
+ "%s【%s】" % [surface_form, reading]
9
6
  end
10
7
  end
11
8
  end
@@ -1,22 +1,21 @@
1
- # -*- coding: utf-8 -*-
1
+ require 'nkf'
2
+
2
3
  module Furigana
3
4
  module Formatter
4
5
  class Yomikata < Formatter::Base
5
- class << self
6
- def replacement(surface_form, reading)
7
- reading
8
- end
6
+ def replacement(surface_form, reading)
7
+ reading
8
+ end
9
9
 
10
- def format(text, tokens)
11
- k2h(super)
12
- end
10
+ def render
11
+ k2h(super)
12
+ end
13
13
 
14
- private
14
+ private
15
15
 
16
- def k2h(k)
17
- return nil if k.nil?
18
- NKF.nkf("-h1 -w", k)
19
- end
16
+ def k2h(k)
17
+ return nil if k.nil?
18
+ NKF.nkf("-h1 -w", k)
20
19
  end
21
20
  end
22
21
  end
@@ -1,32 +1,38 @@
1
- # -*- coding: utf-8 -*-
2
1
  require 'open3'
3
2
 
4
3
  module Furigana
5
4
  class Mecab
6
5
  class << self
7
- def sanitize_text(text)
8
- format("%s\n", text.tr("\n", ""))
9
- end
10
-
11
6
  def tokenize(text)
12
7
  surface_form, reading = 0, 1
13
8
  stdout, stderr, status = Open3.capture3("mecab -Ochasen", :stdin_data => sanitize_text(text))
14
9
 
15
- # Avoid `ArgumentError - invalid byte sequence in UTF-8`
16
- lines = if stdout.valid_encoding?
17
- stdout.split("\n")
18
- else
19
- stdout.encode!("UTF-8", "UTF-8", :invalid => :replace, :undef => :replace, :replace => "�")
20
- stdout.split("\n")
21
- end
10
+ lines = split_stdout(stdout)
11
+
12
+ lines.reduce([]) do |kanji_tokens, line|
13
+ return kanji_tokens if line == 'EOS'
22
14
 
23
- lines.inject([]) do |output, line|
24
15
  columns = line.split("\t")
25
- output << {
16
+ kanji_tokens << {
26
17
  :surface_form => columns[surface_form],
27
18
  :reading => columns[reading]
28
- } if columns[surface_form] != 'EOS'
29
- output
19
+ }
20
+ end
21
+ end
22
+
23
+ private
24
+
25
+ def sanitize_text(text)
26
+ format("%s\n", text.tr("\n", ""))
27
+ end
28
+
29
+ def split_stdout(stdout)
30
+ # Avoid `ArgumentError - invalid byte sequence in UTF-8`
31
+ if stdout.valid_encoding?
32
+ stdout.split("\n")
33
+ else
34
+ stdout.encode!("UTF-8", "UTF-8", :invalid => :replace, :undef => :replace, :replace => "�")
35
+ stdout.split("\n")
30
36
  end
31
37
  end
32
38
  end
@@ -1,10 +1,12 @@
1
- # -*- coding: utf-8 -*-
1
+ require 'diff/lcs'
2
+ require 'nkf'
3
+
2
4
  module Furigana
3
5
  class Reader
4
6
  def reading(text)
5
- Mecab.tokenize(text).inject([]) do |list, token|
6
- with_furigana = add_furigana(token)
7
- list += with_furigana if with_furigana
7
+ Mecab.tokenize(text).reduce([]) do |list, token|
8
+ with_reading = add_reading(token)
9
+ list += with_reading if with_reading
8
10
  list
9
11
  end
10
12
  end
@@ -20,11 +22,12 @@ module Furigana
20
22
  Diff::LCS.sdiff(k2h(token[:surface_form]), k2h(token[:reading]))
21
23
  end
22
24
 
23
- def add_furigana(token)
25
+ def add_reading(token)
24
26
  states = { kanji_and_yomi: '!', yomi: '+', kana: '=' }
25
27
  kanji, yomi = 0, 1
26
28
 
27
29
  list = []
30
+
28
31
  if /\p{Han}/.match(token[:surface_form])
29
32
  on_kanji = false
30
33
  diff_token_surface_form_and_reading(token).each do |part|
@@ -41,6 +44,7 @@ module Furigana
41
44
  end
42
45
  end
43
46
  end
47
+
44
48
  list
45
49
  end
46
50
  end
@@ -1,3 +1,3 @@
1
1
  module Furigana
2
- VERSION = '0.0.6'
2
+ VERSION = '0.0.7'
3
3
  end
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: furigana
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael V. O'Brien
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-02-25 00:00:00.000000000 Z
11
+ date: 2016-08-17 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: diff-lcs
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: rake
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -39,13 +53,27 @@ dependencies:
39
53
  - !ruby/object:Gem::Version
40
54
  version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
- name: diff-lcs
56
+ name: pry
43
57
  requirement: !ruby/object:Gem::Requirement
44
58
  requirements:
45
59
  - - ">="
46
60
  - !ruby/object:Gem::Version
47
61
  version: '0'
48
- type: :runtime
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pry-doc
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
49
77
  prerelease: false
50
78
  version_requirements: !ruby/object:Gem::Requirement
51
79
  requirements:
@@ -93,7 +121,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
93
121
  version: '0'
94
122
  requirements: []
95
123
  rubyforge_project:
96
- rubygems_version: 2.2.2
124
+ rubygems_version: 2.4.3
97
125
  signing_key:
98
126
  specification_version: 4
99
127
  summary: Add furigana to text