ruby-pinyin-ez 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +10 -0
- data/README.markdown +100 -0
- data/lib/ruby-pinyin/backend/ezseg.rb +101 -0
- data/lib/ruby-pinyin/backend/mmseg.rb +110 -0
- data/lib/ruby-pinyin/backend/simple.rb +72 -0
- data/lib/ruby-pinyin/backend.rb +7 -0
- data/lib/ruby-pinyin/data/Mandarin.dat +41208 -0
- data/lib/ruby-pinyin/data/Punctuations.dat +14 -0
- data/lib/ruby-pinyin/data/words.dat +175180 -0
- data/lib/ruby-pinyin/data/words.dic +175180 -0
- data/lib/ruby-pinyin/punctuation.rb +46 -0
- data/lib/ruby-pinyin/util.rb +29 -0
- data/lib/ruby-pinyin/value.rb +16 -0
- data/lib/ruby-pinyin/version.rb +3 -0
- data/lib/ruby-pinyin.rb +41 -0
- metadata +87 -0
@@ -0,0 +1,46 @@
|
|
1
|
+
module PinYin
  # Punctuation lookup table plus the regular expressions built from it.
  #
  # The table is loaded lazily from data/Punctuations.dat (resolved relative
  # to this file) the first time it is needed. Each line of that file holds
  # two whitespace-separated fields, `from` and `to`; `to` is decoded as a hex
  # string when the regexp is built.
  # NOTE(review): `from` looks like a code-point key used by the backends -
  # confirm against the callers.
  module Punctuation

    class << self

      # Regexp matching a run of mapped punctuation at the end of a line.
      #
      # Every table value is hex-decoded, backslash-escaped and placed in one
      # character class. The result is memoized until the table is reloaded.
      #
      # @return [Regexp]
      def regexp
        @regexp ||= begin
          escaped_punctuations = punctuations.values.map { |v| "\\#{[v].pack('H*')}" }.join
          Regexp.new "([#{escaped_punctuations}]+)$"
        end
      end

      # Regexp matching a run of characters in U+3000..U+303F or
      # U+FF00..U+FFEF.
      #
      # @return [Regexp]
      def chinese_regexp
        @chinese_regexp ||= /([\u3000-\u303F\uFF00-\uFFEF]+)/
      end

      # Looks up the mapped value for +code+.
      #
      # @param code [String] a `from` field of the data file
      # @return [String, nil] the `to` field, or nil when +code+ is unknown
      def [](code)
        punctuations[code]
      end

      # @param code [String] a `from` field of the data file
      # @return [Boolean] whether +code+ has an entry in the table
      def include?(code)
        punctuations.key?(code)
      end

      # The punctuation table, loaded from the bundled data file on first use.
      #
      # The table is only cached once the file has been read successfully, so
      # a failed load is retried on the next call instead of leaving an empty
      # table behind.
      #
      # @return [Hash{String => String}]
      # @raise [SystemCallError] when the data file cannot be read
      def punctuations
        return @punctuations if @punctuations

        load_from File.expand_path('../data/Punctuations.dat', __FILE__)
        @punctuations
      end

      # Merges the entries of +file+ into the punctuation table.
      #
      # Blank or single-field lines are skipped: they would otherwise store a
      # nil entry that makes #regexp fail while hex-decoding it.
      #
      # @param file [String] path of a file with "from to" lines
      # @return [Array<String>] the `to` values that were loaded
      # @raise [SystemCallError] when +file+ cannot be read
      def load_from(file)
        entries = {}
        File.readlines(file).each do |line|
          from, to = line.split
          next unless from && to

          entries[from] = to
        end

        # Touch shared state only after the whole file was read and parsed.
        (@punctuations ||= {}).update(entries)
        # The memoized regexp is derived from the table; rebuild it lazily.
        @regexp = nil
        entries.values
      end

    end

  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
|
3
|
+
module PinYin
  # Helpers for turning tone-marked pinyin readings into plain ASCII.
  module Util
    extend self

    # Tone-marked vowel (or "ü" + vowel pair) => [ASCII replacement, tone number].
    #
    # The two-character "ü?" entries come first so they are tried before the
    # single-vowel entries (Hash iteration follows insertion order). The table
    # and its value pairs are frozen because it is shared, read-only data.
    ASCIIMapping = {
      'üē' => ['ue', 1], 'üé' => ['ue', 2], 'üě' => ['ue', 3], 'üè' => ['ue', 4],
      'ā' => ['a', 1], 'ē' => ['e', 1], 'ī' => ['i', 1], 'ō' => ['o', 1], 'ū' => ['u', 1], 'ǖ' => ['v', 1],
      'á' => ['a', 2], 'é' => ['e', 2], 'í' => ['i', 2], 'ó' => ['o', 2], 'ú' => ['u', 2], 'ǘ' => ['v', 2],
      'ǎ' => ['a', 3], 'ě' => ['e', 3], 'ǐ' => ['i', 3], 'ǒ' => ['o', 3], 'ǔ' => ['u', 3], 'ǚ' => ['v', 3],
      'à' => ['a', 4], 'è' => ['e', 4], 'ì' => ['i', 4], 'ò' => ['o', 4], 'ù' => ['u', 4], 'ǜ' => ['v', 4]
    }.each_value(&:freeze).freeze

    # Replaces the first tone-marked vowel found in +reading+ with its ASCII
    # form, optionally appending the tone number.
    #
    # Only the first matching table entry is applied. A reading without any
    # mapped character is returned unchanged (the very same object).
    #
    # @param reading [String] a pinyin reading, e.g. "zhōng"
    # @param with_tone [Boolean] append the tone digit when true
    # @return [String] e.g. "zhong1", or "zhong" when +with_tone+ is false
    def to_ascii(reading, with_tone=true)
      char, (ascii, tone) = ASCIIMapping.find { |marked, _| reading.include?(marked) }
      return reading unless char

      plain = reading.sub(char, ascii)
      with_tone ? plain.concat(tone.to_s) : plain
    end

  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module PinYin
  # A String that additionally carries an +english+ flag.
  #
  # The flag is set at construction time (default: true) and is propagated to
  # every piece produced by #split.
  class Value < String
    attr_accessor :english
    alias :english? :english

    # @param str [String] the text to wrap
    # @param english [Boolean] value of the +english+ flag
    def initialize(str, english=true)
      super(str)
      self.english = english
    end

    # Same as String#split, but every piece is a Value carrying this
    # object's +english+ flag.
    #
    # String#split accepts a block on Ruby >= 2.6 and then returns the
    # receiver instead of an Array; that form is supported here by collecting
    # the pieces first and yielding the wrapped values afterwards.
    #
    # @param args [Array] arguments forwarded to String#split
    # @yieldparam piece [Value] each piece, when a block is given
    # @return [Array<Value>, Value] the pieces, or self when a block is given
    def split(*args)
      # `&nil` stops the caller's block from being forwarded implicitly.
      pieces = super(*args, &nil).map { |piece| self.class.new(piece, english) }
      return pieces unless block_given?

      pieces.each { |piece| yield piece }
      self
    end
  end
end
|
data/lib/ruby-pinyin.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'ruby-pinyin/util'
|
2
|
+
require 'ruby-pinyin/value'
|
3
|
+
require 'ruby-pinyin/punctuation'
|
4
|
+
require 'ruby-pinyin/backend'
|
5
|
+
|
6
|
+
# Public entry points of the library. Every conversion is delegated to the
# object stored in +PinYin.backend+.
module PinYin
  class << self

    # The object that performs romanization; it must respond to
    # +romanize(str, tone, include_punctuations)+.
    attr_accessor :backend

    # Converts +str+ by delegating to the current backend.
    #
    # @param str [String] text to convert
    # @param tone [Object, nil] tone option, passed through to the backend
    # @param include_punctuations [Boolean] passed through to the backend
    # @return [Object] whatever the backend returns; the other methods here
    #   treat it as an Array of words
    def romanize(str, tone=nil, include_punctuations=false)
      backend.romanize(str, tone, include_punctuations)
    end
    alias :of_string :romanize

    # Joins the romanized words of +str+ with +sep+.
    #
    # @param str [String] text to convert
    # @param sep [String] separator placed between words
    # @return [String]
    def permlink(str, sep='-')
      of_string(str).join(sep)
    end

    # Builds an abbreviation from the first character of each romanized word.
    #
    # A word is kept whole when it is the first word and +except_lead+ is
    # set, or when +except_english+ is set and the word reports +english?+.
    # Joining the pieces (instead of appending to a string literal) keeps
    # this working under frozen string literals and lets an empty word,
    # whose first character is nil, contribute nothing rather than raise.
    #
    # @param str [String] text to convert
    # @param except_lead [Boolean] keep the first word unabbreviated
    # @param except_english [Boolean] keep words flagged as English whole
    # @return [String]
    def abbr(str, except_lead=false, except_english=true)
      pieces = of_string(str).each_with_index.map do |word, i|
        keep_whole = (except_lead && i == 0) || (except_english && word.english?)
        keep_whole ? word : word[0]
      end
      pieces.join
    end

    # Romanizes +str+ with punctuation included and joins the words with
    # single spaces.
    #
    # @param str [String] text to convert
    # @param tone [Object, nil] tone option, passed through to the backend
    # @return [String]
    def sentence(str, tone=nil)
      of_string(str, tone, true).join(' ')
    end

    # Replaces the backend with a new instance of the current backend's class
    # built from +files+.
    #
    # NOTE(review): when no backend is set this falls back to
    # PinYin::Backend::MMSeg - confirm that this is the intended default.
    #
    # @param files [Object] passed to the backend constructor
    def override_files=(files)
      klass = backend ? backend.class : PinYin::Backend::MMSeg
      self.backend = klass.new files
    end

  end
end
|
40
|
+
|
41
|
+
# Install a PinYin::Backend::EZSeg instance as the backend when this file is loaded.
PinYin.backend = PinYin::Backend::EZSeg.new
|
metadata
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ruby-pinyin-ez
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.5.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- hzyhzy
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-02-02 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rmmseg-cpp-new
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.3.1
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.3.1
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: minitest
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '5.4'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '5.4'
|
41
|
+
description: Pinyin is a romanization system (phonemic notation) of Chinese characters,
|
42
|
+
this gem helps you to convert Chinese characters into pinyin form.
|
43
|
+
email:
|
44
|
+
- 18670314023@163.com
|
45
|
+
executables: []
|
46
|
+
extensions: []
|
47
|
+
extra_rdoc_files: []
|
48
|
+
files:
|
49
|
+
- LICENSE
|
50
|
+
- README.markdown
|
51
|
+
- lib/ruby-pinyin.rb
|
52
|
+
- lib/ruby-pinyin/backend.rb
|
53
|
+
- lib/ruby-pinyin/backend/ezseg.rb
|
54
|
+
- lib/ruby-pinyin/backend/mmseg.rb
|
55
|
+
- lib/ruby-pinyin/backend/simple.rb
|
56
|
+
- lib/ruby-pinyin/data/Mandarin.dat
|
57
|
+
- lib/ruby-pinyin/data/Punctuations.dat
|
58
|
+
- lib/ruby-pinyin/data/words.dat
|
59
|
+
- lib/ruby-pinyin/data/words.dic
|
60
|
+
- lib/ruby-pinyin/punctuation.rb
|
61
|
+
- lib/ruby-pinyin/util.rb
|
62
|
+
- lib/ruby-pinyin/value.rb
|
63
|
+
- lib/ruby-pinyin/version.rb
|
64
|
+
homepage: https://github.com/hzyhzy/ruby-pinyin
|
65
|
+
licenses:
|
66
|
+
- BSD
|
67
|
+
metadata: {}
|
68
|
+
post_install_message:
|
69
|
+
rdoc_options: []
|
70
|
+
require_paths:
|
71
|
+
- lib
|
72
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - ">="
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0'
|
77
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - ">="
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '0'
|
82
|
+
requirements: []
|
83
|
+
rubygems_version: 3.0.6
|
84
|
+
signing_key:
|
85
|
+
specification_version: 4
|
86
|
+
summary: Convert Chinese characters into pinyin.
|
87
|
+
test_files: []
|