furigana 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/furigana/mecab.rb +10 -1
- data/lib/furigana/reader.rb +15 -17
- data/lib/furigana/version.rb +1 -1
- metadata +2 -2
data/lib/furigana/mecab.rb
CHANGED
@@ -11,7 +11,16 @@ module Furigana
|
|
11
11
|
def tokenize(text)
|
12
12
|
surface_form, reading = 0, 1
|
13
13
|
stdout, stderr, status = Open3.capture3("mecab -Ochasen", :stdin_data => sanitize_text(text))
|
14
|
-
|
14
|
+
|
15
|
+
# Avoid `ArgumentError - invalid byte sequence in UTF-8`
|
16
|
+
lines = if stdout.valid_encoding?
|
17
|
+
stdout.split("\n")
|
18
|
+
else
|
19
|
+
stdout.encode!("UTF-8", "UTF-8", :invalid => :replace, :undef => :replace, :replace => "�")
|
20
|
+
stdout.split("\n")
|
21
|
+
end
|
22
|
+
|
23
|
+
lines.inject([]) do |output, line|
|
15
24
|
columns = line.split("\t")
|
16
25
|
output << {
|
17
26
|
:surface_form => columns[surface_form],
|
data/lib/furigana/reader.rb
CHANGED
@@ -16,12 +16,8 @@ module Furigana
|
|
16
16
|
NKF.nkf("-h1 -w", k)
|
17
17
|
end
|
18
18
|
|
19
|
-
def sdiff(first, second)
|
20
|
-
Diff::LCS.sdiff(first, second)
|
21
|
-
end
|
22
|
-
|
23
19
|
def diff_token_surface_form_and_reading(token)
|
24
|
-
sdiff(k2h(token[:surface_form]), k2h(token[:reading]))
|
20
|
+
Diff::LCS.sdiff(k2h(token[:surface_form]), k2h(token[:reading]))
|
25
21
|
end
|
26
22
|
|
27
23
|
def add_furigana(token)
|
@@ -29,18 +25,20 @@ module Furigana
|
|
29
25
|
kanji, yomi = 0, 1
|
30
26
|
|
31
27
|
list = []
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
28
|
+
if /\p{Han}/.match(token[:surface_form])
|
29
|
+
on_kanji = false
|
30
|
+
diff_token_surface_form_and_reading(token).each do |part|
|
31
|
+
case part.action
|
32
|
+
when states[:kanji_and_yomi]
|
33
|
+
list.push ['',''] unless on_kanji
|
34
|
+
list.last[kanji] += part.old_element
|
35
|
+
list.last[yomi] += part.new_element
|
36
|
+
on_kanji = true
|
37
|
+
when states[:yomi]
|
38
|
+
list.last[yomi] += part.new_element
|
39
|
+
when states[:kana]
|
40
|
+
on_kanji = false
|
41
|
+
end
|
44
42
|
end
|
45
43
|
end
|
46
44
|
list
|
data/lib/furigana/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: furigana
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-01-
|
12
|
+
date: 2013-01-22 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|