ting 0.11.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/CHANGELOG.md +9 -2
- data/Gemfile.lock +2 -2
- data/lib/ting.rb +18 -3
- data/lib/ting/groundwork.rb +2 -0
- data/lib/ting/hanyu_pinyin_parser.rb +43 -10
- data/lib/ting/tones/accents.rb +1 -1
- data/lib/ting/version.rb +1 -1
- data/spec/hanyu_pinyin_parser_spec.rb +110 -0
- data/spec/ting_spec.rb +13 -2
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 6e563c103a40efc2e917643626b84bd6f46f72ab054d9c8c090b1fca418f5226
|
4
|
+
data.tar.gz: 79b68c136dfc2bc4b6424f2cd8ac09f861f0ca4df6bd86f54fea24b6bf5c2da1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 28b3a045d027467fd22f47c20835bcc1576809b40c6e239b0a40bfc965deb5b4c2ac554d4050758268b8b7bcf063b16a94aa87f85b4a568366824cab9ecf6695
|
7
|
+
data.tar.gz: 32f06a2130fb4d530ae63084142ceb82bb54047adaa9ea81cb61691401548342f62b54244e19529996c735c448cb414773c7d793c2859a5550ebcdbae167075f
|
data/CHANGELOG.md
CHANGED
@@ -9,16 +9,23 @@ All notable changes to this project will be documented in this file. This change
|
|
9
9
|
|
10
10
|
## [Unreleased]
|
11
11
|
|
12
|
+
## [0.12.0] - released 2018-10-18
|
13
|
+
|
14
|
+
### Changed
|
15
|
+
- `Ting.pretty_tones` inserts apostrophes before syllables beginning with a, e, o
|
16
|
+
- `Ting::HanyuPinyinParser` is better at splitting up compound words correctly
|
17
|
+
|
12
18
|
## [0.11.0] - released 2017-11-06
|
13
19
|
|
14
20
|
### Changed
|
15
21
|
- Make `Ting.pretty_tones` work with bopomofo
|
16
|
-
- Correct name of Palladius (
|
22
|
+
- Correct name of Palladius (Cyrillic transcription)
|
17
23
|
- Add missing IPA finals
|
18
24
|
- Change Palladius transcription of "hui" to хуэй
|
19
25
|
|
20
26
|
### Added
|
21
27
|
- `bin/ting_table` script
|
22
28
|
|
23
|
-
[Unreleased]: https://github.com/lambdaisland/uri/compare/v0.11.0...HEAD
|
29
|
+
[Unreleased]: https://github.com/lambdaisland/uri/compare/v0.12.0...HEAD
|
30
|
+
[0.12.0]: https://github.com/plexus/ting/compare/v0.12.0...v0.11.0
|
24
31
|
[0.11.0]: https://github.com/plexus/ting/compare/v0.11.0...v0.10.0
|
data/Gemfile.lock
CHANGED
data/lib/ting.rb
CHANGED
@@ -50,15 +50,30 @@ module Ting
|
|
50
50
|
)
|
51
51
|
end
|
52
52
|
|
53
|
+
# The longest syllables are six letters long (chuang, shuang, zhuang).
|
54
|
+
SYLLABLE_REGEXP = /[A-Za-züÜ]{1,6}\d?/
|
53
55
|
|
54
56
|
def pretty_tones(string)
|
55
|
-
string.gsub('u:','ü')
|
56
|
-
|
57
|
+
string = string.gsub('u:', 'ü') # (note that this implicitly dups the string)
|
58
|
+
# Scan through the string, replacing syllable by syllable.
|
59
|
+
pos = 0
|
60
|
+
while match = string.match(SYLLABLE_REGEXP, pos)
|
61
|
+
syllable = match[0]
|
62
|
+
replacement = SYLLABLE_CACHE[syllable]
|
63
|
+
match_pos = match.begin(0)
|
64
|
+
# If this syllable starts with a vowel and is preceded by a letter (not whitespace or
|
65
|
+
# control characters), insert an apostrophe before it.
|
66
|
+
if match_pos > 0 && string[match_pos - 1] =~ /[[:alpha:]]/ && syllable =~ /^[AEOaoe]/
|
67
|
+
replacement = "'" + replacement
|
68
|
+
end
|
69
|
+
string[match_pos, syllable.length] = replacement
|
70
|
+
pos = match_pos + replacement.length
|
57
71
|
end
|
72
|
+
string
|
58
73
|
end
|
59
74
|
|
60
75
|
def bpmf(string)
|
61
|
-
string.gsub('u:','ü').scan(
|
76
|
+
string.gsub('u:', 'ü').scan(SYLLABLE_REGEXP).map do |m|
|
62
77
|
Ting.writer(:zhuyin, :marks).(
|
63
78
|
Ting.reader(:hanyu, :numbers).(m.downcase)
|
64
79
|
)
|
data/lib/ting/groundwork.rb
CHANGED
data/lib/ting/hanyu_pinyin_parser.rb
CHANGED
@@ -14,26 +14,59 @@ module Ting
|
|
14
14
|
@all_syllables ||= Ting.all_syllables.map(&hanyu_writer).sort_by(&:length).reverse
|
15
15
|
end
|
16
16
|
|
17
|
-
def
|
18
|
-
@
|
17
|
+
def consonant_syllables
|
18
|
+
@consonant_syllables ||= all_syllables.grep(/^[bcdfghjklmnpqrstwxyz]/i)
|
19
19
|
end
|
20
20
|
|
21
21
|
def pinyin_regexp
|
22
|
-
|
22
|
+
# This will parse a cluster of pinyin, i.e. an uninterrupted string of pinyin characters without punctuation.
|
23
|
+
@pinyin_cluster_regexp ||= /\A
|
24
|
+
# Every syllable can appear at the start of a cluster.
|
25
|
+
(#{Regexp.union(all_syllables)})
|
26
|
+
# However, only syllables starting with a consonant can follow, as syllables starting with a vowel have to
|
27
|
+
# be prefixed with an apostrophe.
|
28
|
+
# Since it is common to omit the apostrophe when there is no ambiguity, also allow syllables starting with
|
29
|
+
# a vowel after all letters except n and g, and after -ong, since -on does not appear at the end of a valid
|
30
|
+
# syllable.
|
31
|
+
(#{Regexp.union(consonant_syllables)}|(?<=[^ng]|[ōóǒòo]ng)#{Regexp.union(all_syllables)})*
|
32
|
+
(r)?
|
33
|
+
\Z/x
|
23
34
|
end
|
24
35
|
|
25
|
-
def
|
26
|
-
pinyin.
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
36
|
+
def pinyin_separator_regexp
|
37
|
+
# A regular expression that matches every character that can *not* appear in pinyin.
|
38
|
+
@pinyin_separator_regexp ||= Regexp.new("[^#{all_syllables.join.downcase.split("").sort.uniq.join}]+")
|
39
|
+
end
|
40
|
+
|
41
|
+
def parse_cluster(pinyin)
|
42
|
+
syllables = []
|
43
|
+
|
44
|
+
# Chop off one syllable at a time from the end by continuously matching the same regular expression.
|
45
|
+
# This ensures the pinyin will be split into only valid pinyin syllables. Because a match capture will
|
46
|
+
# only contain the *last* content it has matched, we have to use a loop.
|
47
|
+
while match = pinyin_regexp.match(pinyin)
|
48
|
+
# If an 'r' at the end was matched, this implies that all other parts of the string were matched as
|
49
|
+
# syllables, and this cluster uses erhua.
|
50
|
+
if 'r' == match[3]
|
51
|
+
syllables << 'er'
|
52
|
+
pinyin = pinyin.chop
|
31
53
|
end
|
54
|
+
last_syllable = match[2] || match[1]
|
55
|
+
syllables << last_syllable
|
56
|
+
pinyin = pinyin[0, pinyin.length - last_syllable.length]
|
32
57
|
end
|
58
|
+
|
59
|
+
raise ArgumentError, "Unparseable pinyin fragment encountered: #{pinyin}" if !pinyin.empty?
|
60
|
+
|
61
|
+
syllables.reverse
|
33
62
|
end
|
34
63
|
|
35
64
|
def parse(pinyin)
|
36
|
-
|
65
|
+
# hanyu_reader cannot parse uppercase pinyin.
|
66
|
+
pinyin = pinyin.downcase
|
67
|
+
|
68
|
+
clusters = pinyin.split(pinyin_separator_regexp)
|
69
|
+
clusters.flat_map {|cluster| parse_cluster(cluster)}.flat_map(&hanyu_reader)
|
37
70
|
end
|
38
71
|
alias call parse
|
39
72
|
|
data/lib/ting/tones/accents.rb
CHANGED
data/lib/ting/version.rb
CHANGED
data/spec/hanyu_pinyin_parser_spec.rb
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
describe Ting::HanyuPinyinParser do
|
5
|
+
let(:parser) { Ting::HanyuPinyinParser.new }
|
6
|
+
|
7
|
+
it 'should be able to parse boring characters' do
|
8
|
+
pinyin = "xíbié de hǎi'àn"
|
9
|
+
expect(parser.parse(pinyin)).to eq([
|
10
|
+
Ting::Syllable.new( Ting::Initial::Xi, Ting::Final::I, 2 ),
|
11
|
+
Ting::Syllable.new( Ting::Initial::Bo, Ting::Final::Ie, 2 ),
|
12
|
+
Ting::Syllable.new( Ting::Initial::De, Ting::Final::E, 5 ),
|
13
|
+
Ting::Syllable.new( Ting::Initial::He, Ting::Final::Ai, 3 ),
|
14
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::An, 4 ),
|
15
|
+
])
|
16
|
+
end
|
17
|
+
|
18
|
+
it 'should be able to parse erhua' do
|
19
|
+
pinyin = "wèir Wèir"
|
20
|
+
expect(parser.parse(pinyin)).to eq([
|
21
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Ui, 4 ),
|
22
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Er, 5 ),
|
23
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Ui, 4 ),
|
24
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Er, 5 ),
|
25
|
+
])
|
26
|
+
end
|
27
|
+
|
28
|
+
it 'should be able to discern erhua from other Ri syllables' do
|
29
|
+
pinyin = Ting.pretty_tones 'yang2rou4'
|
30
|
+
expect(parser.parse(pinyin)).to eq([
|
31
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Iang, 2 ),
|
32
|
+
Ting::Syllable.new( Ting::Initial::Ri, Ting::Final::Ou, 4 ),
|
33
|
+
])
|
34
|
+
|
35
|
+
pinyin = Ting.pretty_tones 'sui1ran2'
|
36
|
+
expect(parser.parse(pinyin)).to eq([
|
37
|
+
Ting::Syllable.new( Ting::Initial::Si, Ting::Final::Ui, 1 ),
|
38
|
+
Ting::Syllable.new( Ting::Initial::Ri, Ting::Final::An, 2 ),
|
39
|
+
])
|
40
|
+
end
|
41
|
+
|
42
|
+
it 'should parse er4 and her2 correctly' do
|
43
|
+
expect(parser.parse('èr')).to eq([
|
44
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Er, 4 ),
|
45
|
+
])
|
46
|
+
expect(parser.parse('hér')).to eq([
|
47
|
+
Ting::Syllable.new( Ting::Initial::He, Ting::Final::E, 2 ),
|
48
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Er, 5 ),
|
49
|
+
])
|
50
|
+
end
|
51
|
+
|
52
|
+
it 'should parse ou1zhou1 correctly' do
|
53
|
+
pinyin = Ting.pretty_tones('ou1zhou1')
|
54
|
+
expect(parser.parse(pinyin)).to eq([
|
55
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Ou, 1 ),
|
56
|
+
Ting::Syllable.new( Ting::Initial::Zhi, Ting::Final::Ou, 1 ),
|
57
|
+
])
|
58
|
+
end
|
59
|
+
|
60
|
+
it 'should parse sheng3lve4 correctly' do
|
61
|
+
expect(parser.parse('shěnglüè')).to eq([
|
62
|
+
Ting::Syllable.new( Ting::Initial::Shi, Ting::Final::Eng, 3 ),
|
63
|
+
Ting::Syllable.new( Ting::Initial::Le, Ting::Final::Ue, 4 ),
|
64
|
+
])
|
65
|
+
end
|
66
|
+
|
67
|
+
it 'should parse regardless of apostrophes and weird whitespace' do
|
68
|
+
pinyin = "Xī'ān\thǎowánr\tma?\nHǎowánr!"
|
69
|
+
expect(parser.parse(pinyin).map(&:tone)).to eq([1, 1, 3, 2, 5, 5, 3, 2, 5])
|
70
|
+
end
|
71
|
+
|
72
|
+
it 'should parse ambiguous syllables based on context' do
|
73
|
+
pinyin = 'gūnánguǎnǚ'
|
74
|
+
expect(parser.parse(pinyin)).to eq([
|
75
|
+
Ting::Syllable.new( Ting::Initial::Ge, Ting::Final::U, 1 ),
|
76
|
+
Ting::Syllable.new( Ting::Initial::Ne, Ting::Final::An, 2 ),
|
77
|
+
Ting::Syllable.new( Ting::Initial::Ge, Ting::Final::Ua, 3 ),
|
78
|
+
Ting::Syllable.new( Ting::Initial::Ne, Ting::Final::V, 3 ),
|
79
|
+
])
|
80
|
+
|
81
|
+
pinyin = 'yángròu'
|
82
|
+
expect(parser.parse(pinyin)).to eq([
|
83
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Iang, 2 ),
|
84
|
+
Ting::Syllable.new( Ting::Initial::Ri, Ting::Final::Ou, 4 ),
|
85
|
+
])
|
86
|
+
end
|
87
|
+
|
88
|
+
it 'should parse some invalid pinyin (missing apostrophe)' do
|
89
|
+
# Syllables that begin with [aeo] must be prefixed with an apostrophe in the middle of the word.
|
90
|
+
# Ref.: https://en.wikipedia.org/wiki/Pinyin#Pronunciation_of_initials, "Note on the apostrophe"
|
91
|
+
# Still, Ting should be able to parse these syllables if they follow unambiguous characters.
|
92
|
+
|
93
|
+
expect(parser.parse('hǎiàn')).to eq([
|
94
|
+
Ting::Syllable.new( Ting::Initial::He, Ting::Final::Ai, 3 ),
|
95
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::An, 4 ),
|
96
|
+
])
|
97
|
+
|
98
|
+
expect(parser.parse('mòshuǐer')).to eq([
|
99
|
+
Ting::Syllable.new( Ting::Initial::Mo, Ting::Final::O, 4 ),
|
100
|
+
Ting::Syllable.new( Ting::Initial::Shi, Ting::Final::Ui, 3 ),
|
101
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Er, 5 ),
|
102
|
+
])
|
103
|
+
|
104
|
+
expect(parser.parse('gōngānjú')).to eq([
|
105
|
+
Ting::Syllable.new( Ting::Initial::Ge, Ting::Final::Ong, 1 ),
|
106
|
+
Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::An, 1 ),
|
107
|
+
Ting::Syllable.new( Ting::Initial::Ji, Ting::Final::V, 2 ),
|
108
|
+
])
|
109
|
+
end
|
110
|
+
end
|
data/spec/ting_spec.rb
CHANGED
@@ -18,7 +18,18 @@ describe Ting do
|
|
18
18
|
end
|
19
19
|
|
20
20
|
it 'should parse syllables correctly' do
|
21
|
-
expect(Ting.pretty_tones('
|
22
|
-
expect(Ting.bpmf('
|
21
|
+
expect(Ting.pretty_tones('Wo3 de Ou1zhou1 peng2you3 hen3 zhuang4')).to eq('wǒ de ōuzhōu péngyǒu hěn zhuàng')
|
22
|
+
expect(Ting.bpmf('Wo3 de peng2you3 hen3 zhuang4')).to eq('ㄨㄛˇ ㄉㄜ˙ ㄆㄥˊ ㄧㄡˇ ㄏㄣˇ ㄓㄨㄤˋ')
|
23
|
+
end
|
24
|
+
|
25
|
+
it 'should be able to pretty-print simple strings' do
|
26
|
+
expect(Ting.pretty_tones('wo3 ai4 ni3')).to eq('wǒ ài nǐ')
|
27
|
+
expect(Ting.pretty_tones('you3dian3r hao3xiao4')).to eq('yǒudiǎnr hǎoxiào')
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'should insert apostrophes when appropriate' do
|
31
|
+
expect(Ting.pretty_tones('hai3an4')).to eq("hǎi'àn")
|
32
|
+
expect(Ting.pretty_tones('ding4e2')).to eq("dìng'é")
|
33
|
+
expect(Ting.pretty_tones('an5an5an5an5an')).to eq("an'an'an'an'an")
|
23
34
|
end
|
24
35
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ting
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.11.0
|
4
|
+
version: 0.12.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arne Brasseur
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-10-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -104,6 +104,7 @@ files:
|
|
104
104
|
- lib/ting/tones/supernum.rb
|
105
105
|
- lib/ting/version.rb
|
106
106
|
- lib/ting/writer.rb
|
107
|
+
- spec/hanyu_pinyin_parser_spec.rb
|
107
108
|
- spec/jruby_csv_spec.rb
|
108
109
|
- spec/palladius_spec.rb
|
109
110
|
- spec/spec_helper.rb
|
@@ -132,12 +133,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
132
133
|
version: '0'
|
133
134
|
requirements: []
|
134
135
|
rubyforge_project:
|
135
|
-
rubygems_version: 2.6
|
136
|
+
rubygems_version: 2.7.6
|
136
137
|
signing_key:
|
137
138
|
specification_version: 4
|
138
139
|
summary: A conversion library for Chinese transcription methods like Hanyu Pinyin,
|
139
140
|
Bopomofo and Wade-Giles.
|
140
141
|
test_files:
|
142
|
+
- spec/hanyu_pinyin_parser_spec.rb
|
141
143
|
- spec/jruby_csv_spec.rb
|
142
144
|
- spec/palladius_spec.rb
|
143
145
|
- spec/spec_helper.rb
|