ting 0.11.0 → 0.12.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/CHANGELOG.md +9 -2
- data/Gemfile.lock +2 -2
- data/lib/ting.rb +18 -3
- data/lib/ting/groundwork.rb +2 -0
- data/lib/ting/hanyu_pinyin_parser.rb +43 -10
- data/lib/ting/tones/accents.rb +1 -1
- data/lib/ting/version.rb +1 -1
- data/spec/hanyu_pinyin_parser_spec.rb +110 -0
- data/spec/ting_spec.rb +13 -2
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 6e563c103a40efc2e917643626b84bd6f46f72ab054d9c8c090b1fca418f5226
|
4
|
+
data.tar.gz: 79b68c136dfc2bc4b6424f2cd8ac09f861f0ca4df6bd86f54fea24b6bf5c2da1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 28b3a045d027467fd22f47c20835bcc1576809b40c6e239b0a40bfc965deb5b4c2ac554d4050758268b8b7bcf063b16a94aa87f85b4a568366824cab9ecf6695
|
7
|
+
data.tar.gz: 32f06a2130fb4d530ae63084142ceb82bb54047adaa9ea81cb61691401548342f62b54244e19529996c735c448cb414773c7d793c2859a5550ebcdbae167075f
|
data/CHANGELOG.md
CHANGED
@@ -9,16 +9,23 @@ All notable changes to this project will be documented in this file. This change
|
|
9
9
|
|
10
10
|
## [Unreleased]
|
11
11
|
|
12
|
+
## [0.12.0] - released 2018-10-18
|
13
|
+
|
14
|
+
### Changed
|
15
|
+
- `Ting.pretty_tones` inserts apostrophes before syllables beginning with a, e, o
|
16
|
+
- `Ting::HanyuPinyinParser` is better at splitting up compound words correctly
|
17
|
+
|
12
18
|
## [0.11.0] - released 2017-11-06
|
13
19
|
|
14
20
|
### Changed
|
15
21
|
- Make `Ting.pretty_tones` work with bopomofo
|
16
|
-
- Correct name of Palladius (
|
22
|
+
- Correct name of Palladius (Cyrillic transcription)
|
17
23
|
- Add missing IPA finals
|
18
24
|
- Change Palladius transcription of "hui" to хуэй
|
19
25
|
|
20
26
|
### Added
|
21
27
|
- `bin/ting_table` script
|
22
28
|
|
23
|
-
[Unreleased]: https://github.com/lambdaisland/uri/compare/v0.
|
29
|
+
[Unreleased]: https://github.com/plexus/ting/compare/v0.12.0...HEAD
|
30
|
+
[0.12.0]: https://github.com/plexus/ting/compare/v0.11.0...v0.12.0
|
24
31
|
[0.11.0]: https://github.com/plexus/ting/compare/v0.10.0...v0.11.0
|
data/Gemfile.lock
CHANGED
data/lib/ting.rb
CHANGED
@@ -50,15 +50,30 @@ module Ting
|
|
50
50
|
)
|
51
51
|
end
|
52
52
|
|
53
|
+
# The longest syllables are six letters long (chuang, shuang, zhuang).
|
54
|
+
SYLLABLE_REGEXP = /[A-Za-züÜ]{1,6}\d?/
|
53
55
|
|
54
56
|
def pretty_tones(string)
|
55
|
-
string.gsub('u:','ü')
|
56
|
-
|
57
|
+
string = string.gsub('u:', 'ü') # (note that this implicitly dups the string)
|
58
|
+
# Scan through the string, replacing syllable by syllable.
|
59
|
+
pos = 0
|
60
|
+
while match = string.match(SYLLABLE_REGEXP, pos)
|
61
|
+
syllable = match[0]
|
62
|
+
replacement = SYLLABLE_CACHE[syllable]
|
63
|
+
match_pos = match.begin(0)
|
64
|
+
# If this syllable starts with a vowel and is preceded by a letter (not whitespace or
|
65
|
+
# control characters), insert an apostrophe before it.
|
66
|
+
if match_pos > 0 && string[match_pos - 1] =~ /[[:alpha:]]/ && syllable =~ /^[AEOaoe]/
|
67
|
+
replacement = "'" + replacement
|
68
|
+
end
|
69
|
+
string[match_pos, syllable.length] = replacement
|
70
|
+
pos = match_pos + replacement.length
|
57
71
|
end
|
72
|
+
string
|
58
73
|
end
|
59
74
|
|
60
75
|
def bpmf(string)
|
61
|
-
string.gsub('u:','ü').scan(
|
76
|
+
string.gsub('u:', 'ü').scan(SYLLABLE_REGEXP).map do |m|
|
62
77
|
Ting.writer(:zhuyin, :marks).(
|
63
78
|
Ting.reader(:hanyu, :numbers).(m.downcase)
|
64
79
|
)
|
data/lib/ting/groundwork.rb
CHANGED
data/lib/ting/hanyu_pinyin_parser.rb
CHANGED
@@ -14,26 +14,59 @@ module Ting
|
|
14
14
|
@all_syllables ||= Ting.all_syllables.map(&hanyu_writer).sort_by(&:length).reverse
|
15
15
|
end
|
16
16
|
|
17
|
-
def
|
18
|
-
@
|
17
|
+
def consonant_syllables
|
18
|
+
@consonant_syllables ||= all_syllables.grep(/^[bcdfghjklmnpqrstwxyz]/i)
|
19
19
|
end
|
20
20
|
|
21
21
|
def pinyin_regexp
|
22
|
-
|
22
|
+
# This will parse a cluster of pinyin, i.e. an uninterrupted string of pinyin characters without punctuation.
|
23
|
+
@pinyin_cluster_regexp ||= /\A
|
24
|
+
# Every syllable can appear at the start of a cluster.
|
25
|
+
(#{Regexp.union(all_syllables)})
|
26
|
+
# However, only syllables starting with a consonant can follow, as syllables starting with a vowel have to
|
27
|
+
# be prefixed with an apostrophe.
|
28
|
+
# Since it is common to omit the apostrophe when there is no ambiguity, also allow syllables starting with
|
29
|
+
# a vowel after all letters except n and g, and after -ong, since -on does not appear at the end of a valid
|
30
|
+
# syllable.
|
31
|
+
(#{Regexp.union(consonant_syllables)}|(?<=[^ng]|[ōóǒòo]ng)#{Regexp.union(all_syllables)})*
|
32
|
+
(r)?
|
33
|
+
\Z/x
|
23
34
|
end
|
24
35
|
|
25
|
-
def
|
26
|
-
pinyin.
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
36
|
+
def pinyin_separator_regexp
|
37
|
+
# A regular expression that matches every character that can *not* appear in pinyin.
|
38
|
+
@pinyin_separator_regexp ||= Regexp.new("[^#{all_syllables.join.downcase.split("").sort.uniq.join}]+")
|
39
|
+
end
|
40
|
+
|
41
|
+
def parse_cluster(pinyin)
|
42
|
+
syllables = []
|
43
|
+
|
44
|
+
# Chop off one syllable at a time from the end by continuously matching the same regular expression.
|
45
|
+
# This ensures the pinyin will be split into only valid pinyin syllables. Because a match capture will
|
46
|
+
# only contain the *last* content it has matched, we have to use a loop.
|
47
|
+
while match = pinyin_regexp.match(pinyin)
|
48
|
+
# If an 'r' at the end was matched, this implies that all other parts of the string were matched as
|
49
|
+
# syllables, and this cluster uses erhua.
|
50
|
+
if 'r' == match[3]
|
51
|
+
syllables << 'er'
|
52
|
+
pinyin = pinyin.chop
|
31
53
|
end
|
54
|
+
last_syllable = match[2] || match[1]
|
55
|
+
syllables << last_syllable
|
56
|
+
pinyin = pinyin[0, pinyin.length - last_syllable.length]
|
32
57
|
end
|
58
|
+
|
59
|
+
raise ArgumentError, "Unparseable pinyin fragment encountered: #{pinyin}" if !pinyin.empty?
|
60
|
+
|
61
|
+
syllables.reverse
|
33
62
|
end
|
34
63
|
|
35
64
|
def parse(pinyin)
|
36
|
-
|
65
|
+
# hanyu_reader cannot parse uppercase pinyin.
|
66
|
+
pinyin = pinyin.downcase
|
67
|
+
|
68
|
+
clusters = pinyin.split(pinyin_separator_regexp)
|
69
|
+
clusters.flat_map {|cluster| parse_cluster(cluster)}.flat_map(&hanyu_reader)
|
37
70
|
end
|
38
71
|
alias call parse
|
39
72
|
|
data/lib/ting/tones/accents.rb
CHANGED
data/lib/ting/version.rb
CHANGED
data/spec/hanyu_pinyin_parser_spec.rb
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe Ting::HanyuPinyinParser do
|
5
|
+
let(:parser) { Ting::HanyuPinyinParser.new }
|
6
|
+
|
7
|
+
it 'should be able to parse boring characters' do
|
8
|
+
pinyin = "xíbié de hǎi'àn"
|
9
|
+
expect(parser.parse(pinyin)).to eq([
|
10
|
+
Ting::Syllable.new( Ting::Initial::Xi, Ting::Final::I, 2 ),
|
11
|
+
Ting::Syllable.new( Ting::Initial::Bo, Ting::Final::Ie, 2 ),
|
12
|
+
Ting::Syllable.new( Ting::Initial::De, Ting::Final::E, 5 ),
|
13
|
+
Ting::Syllable.new( Ting::Initial::He, Ting::Final::Ai, 3 ),
|
14
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::An, 4 ),
|
15
|
+
])
|
16
|
+
end
|
17
|
+
|
18
|
+
it 'should be able to parse erhua' do
|
19
|
+
pinyin = "wèir Wèir"
|
20
|
+
expect(parser.parse(pinyin)).to eq([
|
21
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Ui, 4 ),
|
22
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Er, 5 ),
|
23
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Ui, 4 ),
|
24
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Er, 5 ),
|
25
|
+
])
|
26
|
+
end
|
27
|
+
|
28
|
+
it 'should be able to discern erhua from other Ri syllables' do
|
29
|
+
pinyin = Ting.pretty_tones 'yang2rou4'
|
30
|
+
expect(parser.parse(pinyin)).to eq([
|
31
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Iang, 2 ),
|
32
|
+
Ting::Syllable.new( Ting::Initial::Ri, Ting::Final::Ou, 4 ),
|
33
|
+
])
|
34
|
+
|
35
|
+
pinyin = Ting.pretty_tones 'sui1ran2'
|
36
|
+
expect(parser.parse(pinyin)).to eq([
|
37
|
+
Ting::Syllable.new( Ting::Initial::Si, Ting::Final::Ui, 1 ),
|
38
|
+
Ting::Syllable.new( Ting::Initial::Ri, Ting::Final::An, 2 ),
|
39
|
+
])
|
40
|
+
end
|
41
|
+
|
42
|
+
it 'should parse er4 and her2 correctly' do
|
43
|
+
expect(parser.parse('èr')).to eq([
|
44
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Er, 4 ),
|
45
|
+
])
|
46
|
+
expect(parser.parse('hér')).to eq([
|
47
|
+
Ting::Syllable.new( Ting::Initial::He, Ting::Final::E, 2 ),
|
48
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Er, 5 ),
|
49
|
+
])
|
50
|
+
end
|
51
|
+
|
52
|
+
it 'should parse ou1zhou1 correctly' do
|
53
|
+
pinyin = Ting.pretty_tones('ou1zhou1')
|
54
|
+
expect(parser.parse(pinyin)).to eq([
|
55
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Ou, 1 ),
|
56
|
+
Ting::Syllable.new( Ting::Initial::Zhi, Ting::Final::Ou, 1 ),
|
57
|
+
])
|
58
|
+
end
|
59
|
+
|
60
|
+
it 'should parse sheng3lve4 correctly' do
|
61
|
+
expect(parser.parse('shěnglüè')).to eq([
|
62
|
+
Ting::Syllable.new( Ting::Initial::Shi, Ting::Final::Eng, 3 ),
|
63
|
+
Ting::Syllable.new( Ting::Initial::Le, Ting::Final::Ue, 4 ),
|
64
|
+
])
|
65
|
+
end
|
66
|
+
|
67
|
+
it 'should parse regardless of apostrophes and weird whitespace' do
|
68
|
+
pinyin = "Xī'ān\thǎowánr\tma?\nHǎowánr!"
|
69
|
+
expect(parser.parse(pinyin).map(&:tone)).to eq([1, 1, 3, 2, 5, 5, 3, 2, 5])
|
70
|
+
end
|
71
|
+
|
72
|
+
it 'should parse ambiguous syllables based on context' do
|
73
|
+
pinyin = 'gūnánguǎnǚ'
|
74
|
+
expect(parser.parse(pinyin)).to eq([
|
75
|
+
Ting::Syllable.new( Ting::Initial::Ge, Ting::Final::U, 1 ),
|
76
|
+
Ting::Syllable.new( Ting::Initial::Ne, Ting::Final::An, 2 ),
|
77
|
+
Ting::Syllable.new( Ting::Initial::Ge, Ting::Final::Ua, 3 ),
|
78
|
+
Ting::Syllable.new( Ting::Initial::Ne, Ting::Final::V, 3 ),
|
79
|
+
])
|
80
|
+
|
81
|
+
pinyin = 'yángròu'
|
82
|
+
expect(parser.parse(pinyin)).to eq([
|
83
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Iang, 2 ),
|
84
|
+
Ting::Syllable.new( Ting::Initial::Ri, Ting::Final::Ou, 4 ),
|
85
|
+
])
|
86
|
+
end
|
87
|
+
|
88
|
+
it 'should parse some invalid pinyin (missing apostrophe)' do
|
89
|
+
# Syllables that begin with [aeo] must be prefixed with an apostrophe in the middle of the word.
|
90
|
+
# Ref.: https://en.wikipedia.org/wiki/Pinyin#Pronunciation_of_initials, "Note on the apostrophe"
|
91
|
+
# Still, Ting should be able to parse these syllables if they follow unambiguous characters.
|
92
|
+
|
93
|
+
expect(parser.parse('hǎiàn')).to eq([
|
94
|
+
Ting::Syllable.new( Ting::Initial::He, Ting::Final::Ai, 3 ),
|
95
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::An, 4 ),
|
96
|
+
])
|
97
|
+
|
98
|
+
expect(parser.parse('mòshuǐer')).to eq([
|
99
|
+
Ting::Syllable.new( Ting::Initial::Mo, Ting::Final::O, 4 ),
|
100
|
+
Ting::Syllable.new( Ting::Initial::Shi, Ting::Final::Ui, 3 ),
|
101
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Er, 5 ),
|
102
|
+
])
|
103
|
+
|
104
|
+
expect(parser.parse('gōngānjú')).to eq([
|
105
|
+
Ting::Syllable.new( Ting::Initial::Ge, Ting::Final::Ong, 1 ),
|
106
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::An, 1 ),
|
107
|
+
Ting::Syllable.new( Ting::Initial::Ji, Ting::Final::V, 2 ),
|
108
|
+
])
|
109
|
+
end
|
110
|
+
end
|
data/spec/ting_spec.rb
CHANGED
@@ -18,7 +18,18 @@ describe Ting do
|
|
18
18
|
end
|
19
19
|
|
20
20
|
it 'should parse syllables correctly' do
|
21
|
-
expect(Ting.pretty_tones('
|
22
|
-
expect(Ting.bpmf('
|
21
|
+
expect(Ting.pretty_tones('Wo3 de Ou1zhou1 peng2you3 hen3 zhuang4')).to eq('wǒ de ōuzhōu péngyǒu hěn zhuàng')
|
22
|
+
expect(Ting.bpmf('Wo3 de peng2you3 hen3 zhuang4')).to eq('ㄨㄛˇ ㄉㄜ˙ ㄆㄥˊ ㄧㄡˇ ㄏㄣˇ ㄓㄨㄤˋ')
|
23
|
+
end
|
24
|
+
|
25
|
+
it 'should be able to pretty-print simple strings' do
|
26
|
+
expect(Ting.pretty_tones('wo3 ai4 ni3')).to eq('wǒ ài nǐ')
|
27
|
+
expect(Ting.pretty_tones('you3dian3r hao3xiao4')).to eq('yǒudiǎnr hǎoxiào')
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'should insert apostrophes when appropriate' do
|
31
|
+
expect(Ting.pretty_tones('hai3an4')).to eq("hǎi'àn")
|
32
|
+
expect(Ting.pretty_tones('ding4e2')).to eq("dìng'é")
|
33
|
+
expect(Ting.pretty_tones('an5an5an5an5an')).to eq("an'an'an'an'an")
|
23
34
|
end
|
24
35
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ting
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.12.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arne Brasseur
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-10-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -104,6 +104,7 @@ files:
|
|
104
104
|
- lib/ting/tones/supernum.rb
|
105
105
|
- lib/ting/version.rb
|
106
106
|
- lib/ting/writer.rb
|
107
|
+
- spec/hanyu_pinyin_parser_spec.rb
|
107
108
|
- spec/jruby_csv_spec.rb
|
108
109
|
- spec/palladius_spec.rb
|
109
110
|
- spec/spec_helper.rb
|
@@ -132,12 +133,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
132
133
|
version: '0'
|
133
134
|
requirements: []
|
134
135
|
rubyforge_project:
|
135
|
-
rubygems_version: 2.6
|
136
|
+
rubygems_version: 2.7.6
|
136
137
|
signing_key:
|
137
138
|
specification_version: 4
|
138
139
|
summary: A conversion library for Chinese transcription methods like Hanyu Pinyin,
|
139
140
|
Bopomofo and Wade-Giles.
|
140
141
|
test_files:
|
142
|
+
- spec/hanyu_pinyin_parser_spec.rb
|
141
143
|
- spec/jruby_csv_spec.rb
|
142
144
|
- spec/palladius_spec.rb
|
143
145
|
- spec/spec_helper.rb
|