ting 0.11.0 → 0.12.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 832c9fb34c5415f06ee7a3c11063930e377ba9cf
4
- data.tar.gz: 599204c5cec9e06962568922c4c477dd868911ba
2
+ SHA256:
3
+ metadata.gz: 6e563c103a40efc2e917643626b84bd6f46f72ab054d9c8c090b1fca418f5226
4
+ data.tar.gz: 79b68c136dfc2bc4b6424f2cd8ac09f861f0ca4df6bd86f54fea24b6bf5c2da1
5
5
  SHA512:
6
- metadata.gz: c35bc7b3cd2a40edb9fa89fe00089a1d3791eb640600a17b7de419beaa902e8047c20d24dda3d9fdb11346eeca7b9ecee3688d7010949edfabdc14a16187d11a
7
- data.tar.gz: eb3868c3100089e6b9d55e71a7cb0d2f157a7a67ccf81e660422e285954897c3c09b3dd970a9264492823a762b64fdcd7a247fe84bd54ffed041dd1f78dce6c5
6
+ metadata.gz: 28b3a045d027467fd22f47c20835bcc1576809b40c6e239b0a40bfc965deb5b4c2ac554d4050758268b8b7bcf063b16a94aa87f85b4a568366824cab9ecf6695
7
+ data.tar.gz: 32f06a2130fb4d530ae63084142ceb82bb54047adaa9ea81cb61691401548342f62b54244e19529996c735c448cb414773c7d793c2859a5550ebcdbae167075f
@@ -9,16 +9,23 @@ All notable changes to this project will be documented in this file. This change
9
9
 
10
10
  ## [Unreleased]
11
11
 
12
+ ## [0.12.0] - released 2018-10-18
13
+
14
+ ### Changed
15
+ - `Ting.pretty_tones` inserts apostrophes before syllables beginning with a, e, o
16
+ - `Ting::HanyuPinyinParser` is better at splitting up compound words correctly
17
+
12
18
  ## [0.11.0] - released 2017-11-06
13
19
 
14
20
  ### Changed
15
21
  - Make `Ting.pretty_tones` work with bopomofo
16
- - Correct name of Palladius (Cyrilic transcription)
22
+ - Correct name of Palladius (Cyrillic transcription)
17
23
  - Add missing IPA finals
18
24
  - Change Palladius transcription of "hui" to хуэй
19
25
 
20
26
  ### Added
21
27
  - `bin/ting_table` script
22
28
 
23
- [Unreleased]: https://github.com/lambdaisland/uri/compare/v0.11.0...HEAD
29
+ [Unreleased]: https://github.com/plexus/ting/compare/v0.12.0...HEAD
30
+ [0.12.0]: https://github.com/plexus/ting/compare/v0.11.0...v0.12.0
24
31
  [0.11.0]: https://github.com/plexus/ting/compare/v0.10.0...v0.11.0
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- ting (0.10.0)
4
+ ting (0.12.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -239,4 +239,4 @@ DEPENDENCIES
239
239
  ting!
240
240
 
241
241
  BUNDLED WITH
242
- 1.15.4
242
+ 1.16.6
@@ -50,15 +50,30 @@ module Ting
50
50
  )
51
51
  end
52
52
 
53
+ # The longest syllables are six letters long (chuang, shuang, zhuang).
54
+ SYLLABLE_REGEXP = /[A-Za-züÜ]{1,6}\d?/
53
55
 
54
56
  def pretty_tones(string)
55
- string.gsub('u:','ü').gsub(/[A-Za-züÜ]{1,7}\d?/) do |syll|
56
- SYLLABLE_CACHE[syll]
57
+ string = string.gsub('u:', 'ü') # (note that this implicitly dups the string)
58
+ # Scan through the string, replacing syllable by syllable.
59
+ pos = 0
60
+ while match = string.match(SYLLABLE_REGEXP, pos)
61
+ syllable = match[0]
62
+ replacement = SYLLABLE_CACHE[syllable]
63
+ match_pos = match.begin(0)
64
+ # If this syllable starts with a vowel and is preceded by a letter (not whitespace or
65
+ # control characters), insert an apostrophe before it.
66
+ if match_pos > 0 && string[match_pos - 1] =~ /[[:alpha:]]/ && syllable =~ /^[AEOaoe]/
67
+ replacement = "'" + replacement
68
+ end
69
+ string[match_pos, syllable.length] = replacement
70
+ pos = match_pos + replacement.length
57
71
  end
72
+ string
58
73
  end
59
74
 
60
75
  def bpmf(string)
61
- string.gsub('u:','ü').scan(/[A-Za-züÜ]{1,7}\d?/).map do |m|
76
+ string.gsub('u:', 'ü').scan(SYLLABLE_REGEXP).map do |m|
62
77
  Ting.writer(:zhuyin, :marks).(
63
78
  Ting.reader(:hanyu, :numbers).(m.downcase)
64
79
  )
@@ -111,6 +111,8 @@ module Ting
111
111
  alias :to_s :inspect
112
112
 
113
113
  def ==( other )
114
+ return false unless other.is_a? Syllable
115
+
114
116
  [ other.initial, other.final, other.tone, other.capitalized ] ==
115
117
  [ self.initial, self.final, self.tone, self.capitalized ]
116
118
  end
@@ -14,26 +14,59 @@ module Ting
14
14
  @all_syllables ||= Ting.all_syllables.map(&hanyu_writer).sort_by(&:length).reverse
15
15
  end
16
16
 
17
- def sylls_with_erhua
18
- @with_erhua ||= all_syllables.map{|p| p + 'r'}
17
+ def consonant_syllables
18
+ @consonant_syllables ||= all_syllables.grep(/^[bcdfghjklmnpqrstwxyz]/i)
19
19
  end
20
20
 
21
21
  def pinyin_regexp
22
- @pinyin_regexp ||= Regexp.union(*sylls_with_erhua, *all_syllables)
22
+ # This will parse a cluster of pinyin, i.e. an uninterrupted string of pinyin characters without punctuation.
23
+ @pinyin_cluster_regexp ||= /\A
24
+ # Every syllable can appear at the start of a cluster.
25
+ (#{Regexp.union(all_syllables)})
26
+ # However, only syllables starting with a consonant can follow, as syllables starting with a vowel have to
27
+ # be prefixed with an apostrophe.
28
+ # Since it is common to omit the apostrophe when there is no ambiguity, also allow syllables starting with
29
+ # a vowel after all letters except n and g, and after -ong, since -on does not appear at the end of a valid
30
+ # syllable.
31
+ (#{Regexp.union(consonant_syllables)}|(?<=[^ng]|[ōóǒòo]ng)#{Regexp.union(all_syllables)})*
32
+ (r)?
33
+ \Z/x
23
34
  end
24
35
 
25
- def split_pinyin(pinyin)
26
- pinyin.scan(pinyin_regexp).flat_map do |syll|
27
- if sylls_with_erhua.include?(syll) && ! all_syllables.include?(syll)
28
- [ syll[0..-2], 'er']
29
- else
30
- [ syll ]
36
+ def pinyin_separator_regexp
37
+ # A regular expression that matches every character that can *not* appear in pinyin.
38
+ @pinyin_separator_regexp ||= Regexp.new("[^#{all_syllables.join.downcase.split("").sort.uniq.join}]+")
39
+ end
40
+
41
+ def parse_cluster(pinyin)
42
+ syllables = []
43
+
44
+ # Chop off one syllable at a time from the end by continuously matching the same regular expression.
45
+ # This ensures the pinyin will be split into only valid pinyin syllables. Because a match capture will
46
+ # only contain the *last* content it has matched, we have to use a loop.
47
+ while match = pinyin_regexp.match(pinyin)
48
+ # If an 'r' at the end was matched, this implies that all other parts of the string were matched as
49
+ # syllables, and this cluster uses erhua.
50
+ if 'r' == match[3]
51
+ syllables << 'er'
52
+ pinyin = pinyin.chop
31
53
  end
54
+ last_syllable = match[2] || match[1]
55
+ syllables << last_syllable
56
+ pinyin = pinyin[0, pinyin.length - last_syllable.length]
32
57
  end
58
+
59
+ raise ArgumentError, "Unparseable pinyin fragment encountered: #{pinyin}" if !pinyin.empty?
60
+
61
+ syllables.reverse
33
62
  end
34
63
 
35
64
  def parse(pinyin)
36
- split_pinyin(pinyin).map(&hanyu_reader)
65
+ # hanyu_reader cannot parse uppercase pinyin.
66
+ pinyin = pinyin.downcase
67
+
68
+ clusters = pinyin.split(pinyin_separator_regexp)
69
+ clusters.flat_map {|cluster| parse_cluster(cluster)}.flat_map(&hanyu_reader)
37
70
  end
38
71
  alias call parse
39
72
 
@@ -27,7 +27,7 @@ module Ting
27
27
  when /a/
28
28
  syll.sub(/a/, tone_glyph(:a,tone))
29
29
  when /e/
30
- syll.sub(/e/, tone_glyph(:e,tone))
30
+ syll.sub(/e/, tone_glyph(:e,tone)).sub('v', 'ü')
31
31
  when /o/
32
32
  syll.sub(/o/, tone_glyph(:o,tone))
33
33
  when /(i|u|v)\z/
@@ -1,3 +1,3 @@
1
1
  module Ting
2
- VERSION = '0.11.0'
2
+ VERSION = '0.12.0'
3
3
  end
@@ -0,0 +1,110 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'spec_helper'
3
+
4
+ describe Ting::HanyuPinyinParser do
5
+ let(:parser) { Ting::HanyuPinyinParser.new }
6
+
7
+ it 'should be able to parse boring characters' do
8
+ pinyin = "xíbié de hǎi'àn"
9
+ expect(parser.parse(pinyin)).to eq([
10
+ Ting::Syllable.new( Ting::Initial::Xi, Ting::Final::I, 2 ),
11
+ Ting::Syllable.new( Ting::Initial::Bo, Ting::Final::Ie, 2 ),
12
+ Ting::Syllable.new( Ting::Initial::De, Ting::Final::E, 5 ),
13
+ Ting::Syllable.new( Ting::Initial::He, Ting::Final::Ai, 3 ),
14
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::An, 4 ),
15
+ ])
16
+ end
17
+
18
+ it 'should be able to parse erhua' do
19
+ pinyin = "wèir Wèir"
20
+ expect(parser.parse(pinyin)).to eq([
21
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Ui, 4 ),
22
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Er, 5 ),
23
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Ui, 4 ),
24
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Er, 5 ),
25
+ ])
26
+ end
27
+
28
+ it 'should be able to discern erhua from other Ri syllables' do
29
+ pinyin = Ting.pretty_tones 'yang2rou4'
30
+ expect(parser.parse(pinyin)).to eq([
31
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Iang, 2 ),
32
+ Ting::Syllable.new( Ting::Initial::Ri, Ting::Final::Ou, 4 ),
33
+ ])
34
+
35
+ pinyin = Ting.pretty_tones 'sui1ran2'
36
+ expect(parser.parse(pinyin)).to eq([
37
+ Ting::Syllable.new( Ting::Initial::Si, Ting::Final::Ui, 1 ),
38
+ Ting::Syllable.new( Ting::Initial::Ri, Ting::Final::An, 2 ),
39
+ ])
40
+ end
41
+
42
+ it 'should parse er4 and her2 correctly' do
43
+ expect(parser.parse('èr')).to eq([
44
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Er, 4 ),
45
+ ])
46
+ expect(parser.parse('hér')).to eq([
47
+ Ting::Syllable.new( Ting::Initial::He, Ting::Final::E, 2 ),
48
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Er, 5 ),
49
+ ])
50
+ end
51
+
52
+ it 'should parse ou1zhou1 correctly' do
53
+ pinyin = Ting.pretty_tones('ou1zhou1')
54
+ expect(parser.parse(pinyin)).to eq([
55
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Ou, 1 ),
56
+ Ting::Syllable.new( Ting::Initial::Zhi, Ting::Final::Ou, 1 ),
57
+ ])
58
+ end
59
+
60
+ it 'should parse sheng3lve4 correctly' do
61
+ expect(parser.parse('shěnglüè')).to eq([
62
+ Ting::Syllable.new( Ting::Initial::Shi, Ting::Final::Eng, 3 ),
63
+ Ting::Syllable.new( Ting::Initial::Le, Ting::Final::Ue, 4 ),
64
+ ])
65
+ end
66
+
67
+ it 'should parse regardless of apostrophes and weird whitespace' do
68
+ pinyin = "Xī'ān\thǎowánr\tma?\nHǎowánr!"
69
+ expect(parser.parse(pinyin).map(&:tone)).to eq([1, 1, 3, 2, 5, 5, 3, 2, 5])
70
+ end
71
+
72
+ it 'should parse ambiguous syllables based on context' do
73
+ pinyin = 'gūnánguǎnǚ'
74
+ expect(parser.parse(pinyin)).to eq([
75
+ Ting::Syllable.new( Ting::Initial::Ge, Ting::Final::U, 1 ),
76
+ Ting::Syllable.new( Ting::Initial::Ne, Ting::Final::An, 2 ),
77
+ Ting::Syllable.new( Ting::Initial::Ge, Ting::Final::Ua, 3 ),
78
+ Ting::Syllable.new( Ting::Initial::Ne, Ting::Final::V, 3 ),
79
+ ])
80
+
81
+ pinyin = 'yángròu'
82
+ expect(parser.parse(pinyin)).to eq([
83
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Iang, 2 ),
84
+ Ting::Syllable.new( Ting::Initial::Ri, Ting::Final::Ou, 4 ),
85
+ ])
86
+ end
87
+
88
+ it 'should parse some invalid pinyin (missing apostrophe)' do
89
+ # Syllables that begin with [aeo] must be prefixed with an apostrophe in the middle of the word.
90
+ # Ref.: https://en.wikipedia.org/wiki/Pinyin#Pronunciation_of_initials, "Note on the apostrophe"
91
+ # Still, Ting should be able to parse these syllables if they follow unambiguous characters.
92
+
93
+ expect(parser.parse('hǎiàn')).to eq([
94
+ Ting::Syllable.new( Ting::Initial::He, Ting::Final::Ai, 3 ),
95
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::An, 4 ),
96
+ ])
97
+
98
+ expect(parser.parse('mòshuǐer')).to eq([
99
+ Ting::Syllable.new( Ting::Initial::Mo, Ting::Final::O, 4 ),
100
+ Ting::Syllable.new( Ting::Initial::Shi, Ting::Final::Ui, 3 ),
101
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Er, 5 ),
102
+ ])
103
+
104
+ expect(parser.parse('gōngānjú')).to eq([
105
+ Ting::Syllable.new( Ting::Initial::Ge, Ting::Final::Ong, 1 ),
106
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::An, 1 ),
107
+ Ting::Syllable.new( Ting::Initial::Ji, Ting::Final::V, 2 ),
108
+ ])
109
+ end
110
+ end
@@ -18,7 +18,18 @@ describe Ting do
18
18
  end
19
19
 
20
20
  it 'should parse syllables correctly' do
21
- expect(Ting.pretty_tones('wo3 de peng2you3 hen3 zhuang4')).to eq('wǒ de péngyǒu hěn zhuàng')
22
- expect(Ting.bpmf('wo3 de peng2you3 hen3 zhuang4')).to eq('ㄨㄛˇ ㄉㄜ˙ ㄆㄥˊ ㄧㄡˇ ㄏㄣˇ ㄓㄨㄤˋ')
21
+ expect(Ting.pretty_tones('Wo3 de Ou1zhou1 peng2you3 hen3 zhuang4')).to eq('wǒ de ōuzhōu péngyǒu hěn zhuàng')
22
+ expect(Ting.bpmf('Wo3 de peng2you3 hen3 zhuang4')).to eq('ㄨㄛˇ ㄉㄜ˙ ㄆㄥˊ ㄧㄡˇ ㄏㄣˇ ㄓㄨㄤˋ')
23
+ end
24
+
25
+ it 'should be able to pretty-print simple strings' do
26
+ expect(Ting.pretty_tones('wo3 ai4 ni3')).to eq('wǒ ài nǐ')
27
+ expect(Ting.pretty_tones('you3dian3r hao3xiao4')).to eq('yǒudiǎnr hǎoxiào')
28
+ end
29
+
30
+ it 'should insert apostrophes when appropriate' do
31
+ expect(Ting.pretty_tones('hai3an4')).to eq("hǎi'àn")
32
+ expect(Ting.pretty_tones('ding4e2')).to eq("dìng'é")
33
+ expect(Ting.pretty_tones('an5an5an5an5an')).to eq("an'an'an'an'an")
23
34
  end
24
35
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ting
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.0
4
+ version: 0.12.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arne Brasseur
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-11-06 00:00:00.000000000 Z
11
+ date: 2018-10-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -104,6 +104,7 @@ files:
104
104
  - lib/ting/tones/supernum.rb
105
105
  - lib/ting/version.rb
106
106
  - lib/ting/writer.rb
107
+ - spec/hanyu_pinyin_parser_spec.rb
107
108
  - spec/jruby_csv_spec.rb
108
109
  - spec/palladius_spec.rb
109
110
  - spec/spec_helper.rb
@@ -132,12 +133,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
132
133
  version: '0'
133
134
  requirements: []
134
135
  rubyforge_project:
135
- rubygems_version: 2.6.13
136
+ rubygems_version: 2.7.6
136
137
  signing_key:
137
138
  specification_version: 4
138
139
  summary: A conversion library for Chinese transcription methods like Hanyu Pinyin,
139
140
  Bopomofo and Wade-Giles.
140
141
  test_files:
142
+ - spec/hanyu_pinyin_parser_spec.rb
141
143
  - spec/jruby_csv_spec.rb
142
144
  - spec/palladius_spec.rb
143
145
  - spec/spec_helper.rb