furigana 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -11,7 +11,16 @@ module Furigana
11
11
  def tokenize(text)
12
12
  surface_form, reading = 0, 1
13
13
  stdout, stderr, status = Open3.capture3("mecab -Ochasen", :stdin_data => sanitize_text(text))
14
- stdout.split("\n").inject([]) do |output, line|
14
+
15
+ # Avoid `ArgumentError - invalid byte sequence in UTF-8`
16
+ lines = if stdout.valid_encoding?
17
+ stdout.split("\n")
18
+ else
19
+ stdout.encode!("UTF-8", "UTF-8", :invalid => :replace, :undef => :replace, :replace => "�")
20
+ stdout.split("\n")
21
+ end
22
+
23
+ lines.inject([]) do |output, line|
15
24
  columns = line.split("\t")
16
25
  output << {
17
26
  :surface_form => columns[surface_form],
@@ -16,12 +16,8 @@ module Furigana
16
16
  NKF.nkf("-h1 -w", k)
17
17
  end
18
18
 
19
- def sdiff(first, second)
20
- Diff::LCS.sdiff(first, second)
21
- end
22
-
23
19
  def diff_token_surface_form_and_reading(token)
24
- sdiff(k2h(token[:surface_form]), k2h(token[:reading]))
20
+ Diff::LCS.sdiff(k2h(token[:surface_form]), k2h(token[:reading]))
25
21
  end
26
22
 
27
23
  def add_furigana(token)
@@ -29,18 +25,20 @@ module Furigana
29
25
  kanji, yomi = 0, 1
30
26
 
31
27
  list = []
32
- on_kanji = false
33
- diff_token_surface_form_and_reading(token).each do |part|
34
- case part.action
35
- when states[:kanji_and_yomi]
36
- list.push ['',''] unless on_kanji
37
- list.last[kanji] += part.old_element
38
- list.last[yomi] += part.new_element
39
- on_kanji = true
40
- when states[:yomi]
41
- list.last[yomi] += part.new_element
42
- when states[:kana]
43
- on_kanji = false
28
+ if /\p{Han}/.match(token[:surface_form])
29
+ on_kanji = false
30
+ diff_token_surface_form_and_reading(token).each do |part|
31
+ case part.action
32
+ when states[:kanji_and_yomi]
33
+ list.push ['',''] unless on_kanji
34
+ list.last[kanji] += part.old_element
35
+ list.last[yomi] += part.new_element
36
+ on_kanji = true
37
+ when states[:yomi]
38
+ list.last[yomi] += part.new_element
39
+ when states[:kana]
40
+ on_kanji = false
41
+ end
44
42
  end
45
43
  end
46
44
  list
@@ -1,3 +1,3 @@
1
1
  module Furigana
2
- VERSION = '0.0.3'
2
+ VERSION = '0.0.4'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: furigana
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-01-19 00:00:00.000000000 Z
12
+ date: 2013-01-22 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake