furigana 0.0.3 → 0.0.4

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -11,7 +11,16 @@ module Furigana
11
11
  def tokenize(text)
12
12
  surface_form, reading = 0, 1
13
13
  stdout, stderr, status = Open3.capture3("mecab -Ochasen", :stdin_data => sanitize_text(text))
14
- stdout.split("\n").inject([]) do |output, line|
14
+
15
+ # Avoid `ArgumentError - invalid byte sequence in UTF-8`
16
+ lines = if stdout.valid_encoding?
17
+ stdout.split("\n")
18
+ else
19
+ stdout.encode!("UTF-8", "UTF-8", :invalid => :replace, :undef => :replace, :replace => "�")
20
+ stdout.split("\n")
21
+ end
22
+
23
+ lines.inject([]) do |output, line|
15
24
  columns = line.split("\t")
16
25
  output << {
17
26
  :surface_form => columns[surface_form],
@@ -16,12 +16,8 @@ module Furigana
16
16
  NKF.nkf("-h1 -w", k)
17
17
  end
18
18
 
19
- def sdiff(first, second)
20
- Diff::LCS.sdiff(first, second)
21
- end
22
-
23
19
  def diff_token_surface_form_and_reading(token)
24
- sdiff(k2h(token[:surface_form]), k2h(token[:reading]))
20
+ Diff::LCS.sdiff(k2h(token[:surface_form]), k2h(token[:reading]))
25
21
  end
26
22
 
27
23
  def add_furigana(token)
@@ -29,18 +25,20 @@ module Furigana
29
25
  kanji, yomi = 0, 1
30
26
 
31
27
  list = []
32
- on_kanji = false
33
- diff_token_surface_form_and_reading(token).each do |part|
34
- case part.action
35
- when states[:kanji_and_yomi]
36
- list.push ['',''] unless on_kanji
37
- list.last[kanji] += part.old_element
38
- list.last[yomi] += part.new_element
39
- on_kanji = true
40
- when states[:yomi]
41
- list.last[yomi] += part.new_element
42
- when states[:kana]
43
- on_kanji = false
28
+ if /\p{Han}/.match(token[:surface_form])
29
+ on_kanji = false
30
+ diff_token_surface_form_and_reading(token).each do |part|
31
+ case part.action
32
+ when states[:kanji_and_yomi]
33
+ list.push ['',''] unless on_kanji
34
+ list.last[kanji] += part.old_element
35
+ list.last[yomi] += part.new_element
36
+ on_kanji = true
37
+ when states[:yomi]
38
+ list.last[yomi] += part.new_element
39
+ when states[:kana]
40
+ on_kanji = false
41
+ end
44
42
  end
45
43
  end
46
44
  list
@@ -1,3 +1,3 @@
1
1
  module Furigana
2
- VERSION = '0.0.3'
2
+ VERSION = '0.0.4'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: furigana
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-01-19 00:00:00.000000000 Z
12
+ date: 2013-01-22 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake