ting 0.11.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 832c9fb34c5415f06ee7a3c11063930e377ba9cf
4
- data.tar.gz: 599204c5cec9e06962568922c4c477dd868911ba
2
+ SHA256:
3
+ metadata.gz: 6e563c103a40efc2e917643626b84bd6f46f72ab054d9c8c090b1fca418f5226
4
+ data.tar.gz: 79b68c136dfc2bc4b6424f2cd8ac09f861f0ca4df6bd86f54fea24b6bf5c2da1
5
5
  SHA512:
6
- metadata.gz: c35bc7b3cd2a40edb9fa89fe00089a1d3791eb640600a17b7de419beaa902e8047c20d24dda3d9fdb11346eeca7b9ecee3688d7010949edfabdc14a16187d11a
7
- data.tar.gz: eb3868c3100089e6b9d55e71a7cb0d2f157a7a67ccf81e660422e285954897c3c09b3dd970a9264492823a762b64fdcd7a247fe84bd54ffed041dd1f78dce6c5
6
+ metadata.gz: 28b3a045d027467fd22f47c20835bcc1576809b40c6e239b0a40bfc965deb5b4c2ac554d4050758268b8b7bcf063b16a94aa87f85b4a568366824cab9ecf6695
7
+ data.tar.gz: 32f06a2130fb4d530ae63084142ceb82bb54047adaa9ea81cb61691401548342f62b54244e19529996c735c448cb414773c7d793c2859a5550ebcdbae167075f
@@ -9,16 +9,23 @@ All notable changes to this project will be documented in this file. This change
9
9
 
10
10
  ## [Unreleased]
11
11
 
12
+ ## [0.12.0] - released 2018-10-18
13
+
14
+ ### Changed
15
+ - `Ting.pretty_tones` inserts apostrophes before syllables beginning with a, e, o
16
+ - `Ting::HanyuPinyinParser` is better at splitting up compound words correctly
17
+
12
18
  ## [0.11.0] - released 2017-11-06
13
19
 
14
20
  ### Changed
15
21
  - Make `Ting.pretty_tones` work with bopomofo
16
- - Correct name of Palladius (Cyrilic transcription)
22
+ - Correct name of Palladius (Cyrillic transcription)
17
23
  - Add missing IPA finals
18
24
  - Change Palladius transcription of "hui" to хуэй
19
25
 
20
26
  ### Added
21
27
  - `bin/ting_table` script
22
28
 
23
- [Unreleased]: https://github.com/lambdaisland/uri/compare/v0.11.0...HEAD
29
+ [Unreleased]: https://github.com/plexus/ting/compare/v0.12.0...HEAD
30
+ [0.12.0]: https://github.com/plexus/ting/compare/v0.11.0...v0.12.0
24
31
  [0.11.0]: https://github.com/plexus/ting/compare/v0.10.0...v0.11.0
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- ting (0.10.0)
4
+ ting (0.12.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -239,4 +239,4 @@ DEPENDENCIES
239
239
  ting!
240
240
 
241
241
  BUNDLED WITH
242
- 1.15.4
242
+ 1.16.6
@@ -50,15 +50,30 @@ module Ting
50
50
  )
51
51
  end
52
52
 
53
+ # The longest syllables are six letters long (chuang, shuang, zhuang).
54
+ SYLLABLE_REGEXP = /[A-Za-züÜ]{1,6}\d?/
53
55
 
54
56
  def pretty_tones(string)
55
- string.gsub('u:','ü').gsub(/[A-Za-züÜ]{1,7}\d?/) do |syll|
56
- SYLLABLE_CACHE[syll]
57
+ string = string.gsub('u:', 'ü') # (note that this implicitly dups the string)
58
+ # Scan through the string, replacing syllable by syllable.
59
+ pos = 0
60
+ while match = string.match(SYLLABLE_REGEXP, pos)
61
+ syllable = match[0]
62
+ replacement = SYLLABLE_CACHE[syllable]
63
+ match_pos = match.begin(0)
64
+ # If this syllable starts with a vowel and is preceded by a letter (not whitespace or
65
+ # control characters), insert an apostrophe before it.
66
+ if match_pos > 0 && string[match_pos - 1] =~ /[[:alpha:]]/ && syllable =~ /^[AEOaoe]/
67
+ replacement = "'" + replacement
68
+ end
69
+ string[match_pos, syllable.length] = replacement
70
+ pos = match_pos + replacement.length
57
71
  end
72
+ string
58
73
  end
59
74
 
60
75
  def bpmf(string)
61
- string.gsub('u:','ü').scan(/[A-Za-züÜ]{1,7}\d?/).map do |m|
76
+ string.gsub('u:', 'ü').scan(SYLLABLE_REGEXP).map do |m|
62
77
  Ting.writer(:zhuyin, :marks).(
63
78
  Ting.reader(:hanyu, :numbers).(m.downcase)
64
79
  )
@@ -111,6 +111,8 @@ module Ting
111
111
  alias :to_s :inspect
112
112
 
113
113
  def ==( other )
114
+ return false unless other.is_a? Syllable
115
+
114
116
  [ other.initial, other.final, other.tone, other.capitalized ] ==
115
117
  [ self.initial, self.final, self.tone, self.capitalized ]
116
118
  end
@@ -14,26 +14,59 @@ module Ting
14
14
  @all_syllables ||= Ting.all_syllables.map(&hanyu_writer).sort_by(&:length).reverse
15
15
  end
16
16
 
17
- def sylls_with_erhua
18
- @with_erhua ||= all_syllables.map{|p| p + 'r'}
17
+ def consonant_syllables
18
+ @consonant_syllables ||= all_syllables.grep(/^[bcdfghjklmnpqrstwxyz]/i)
19
19
  end
20
20
 
21
21
  def pinyin_regexp
22
- @pinyin_regexp ||= Regexp.union(*sylls_with_erhua, *all_syllables)
22
+ # This will parse a cluster of pinyin, i.e. an uninterrupted string of pinyin characters without punctuation.
23
+ @pinyin_cluster_regexp ||= /\A
24
+ # Every syllable can appear at the start of a cluster.
25
+ (#{Regexp.union(all_syllables)})
26
+ # However, only syllables starting with a consonant can follow, as syllables starting with a vowel have to
27
+ # be prefixed with an apostrophe.
28
+ # Since it is common to omit the apostrophe when there is no ambiguity, also allow syllables starting with
29
+ # a vowel after all letters except n and g, and after -ong, since -on does not appear at the end of a valid
30
+ # syllable.
31
+ (#{Regexp.union(consonant_syllables)}|(?<=[^ng]|[ōóǒòo]ng)#{Regexp.union(all_syllables)})*
32
+ (r)?
33
+ \Z/x
23
34
  end
24
35
 
25
- def split_pinyin(pinyin)
26
- pinyin.scan(pinyin_regexp).flat_map do |syll|
27
- if sylls_with_erhua.include?(syll) && ! all_syllables.include?(syll)
28
- [ syll[0..-2], 'er']
29
- else
30
- [ syll ]
36
+ def pinyin_separator_regexp
37
+ # A regular expression that matches every character that can *not* appear in pinyin.
38
+ @pinyin_separator_regexp ||= Regexp.new("[^#{all_syllables.join.downcase.split("").sort.uniq.join}]+")
39
+ end
40
+
41
+ def parse_cluster(pinyin)
42
+ syllables = []
43
+
44
+ # Chop off one syllable at a time from the end by continuously matching the same regular expression.
45
+ # This ensures the pinyin will be split into only valid pinyin syllables. Because a match capture will
46
+ # only contain the *last* content it has matched, we have to use a loop.
47
+ while match = pinyin_regexp.match(pinyin)
48
+ # If an 'r' at the end was matched, this implies that all other parts of the string were matched as
49
+ # syllables, and this cluster uses erhua.
50
+ if 'r' == match[3]
51
+ syllables << 'er'
52
+ pinyin = pinyin.chop
31
53
  end
54
+ last_syllable = match[2] || match[1]
55
+ syllables << last_syllable
56
+ pinyin = pinyin[0, pinyin.length - last_syllable.length]
32
57
  end
58
+
59
+ raise ArgumentError, "Unparseable pinyin fragment encountered: #{pinyin}" if !pinyin.empty?
60
+
61
+ syllables.reverse
33
62
  end
34
63
 
35
64
  def parse(pinyin)
36
- split_pinyin(pinyin).map(&hanyu_reader)
65
+ # hanyu_reader cannot parse uppercase pinyin.
66
+ pinyin = pinyin.downcase
67
+
68
+ clusters = pinyin.split(pinyin_separator_regexp)
69
+ clusters.flat_map {|cluster| parse_cluster(cluster)}.flat_map(&hanyu_reader)
37
70
  end
38
71
  alias call parse
39
72
 
@@ -27,7 +27,7 @@ module Ting
27
27
  when /a/
28
28
  syll.sub(/a/, tone_glyph(:a,tone))
29
29
  when /e/
30
- syll.sub(/e/, tone_glyph(:e,tone))
30
+ syll.sub(/e/, tone_glyph(:e,tone)).sub('v', 'ü')
31
31
  when /o/
32
32
  syll.sub(/o/, tone_glyph(:o,tone))
33
33
  when /(i|u|v)\z/
@@ -1,3 +1,3 @@
1
1
  module Ting
2
- VERSION = '0.11.0'
2
+ VERSION = '0.12.0'
3
3
  end
@@ -0,0 +1,110 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'spec_helper'
3
+
4
+ describe Ting::HanyuPinyinParser do
5
+ let(:parser) { Ting::HanyuPinyinParser.new }
6
+
7
+ it 'should be able to parse boring characters' do
8
+ pinyin = "xíbié de hǎi'àn"
9
+ expect(parser.parse(pinyin)).to eq([
10
+ Ting::Syllable.new( Ting::Initial::Xi, Ting::Final::I, 2 ),
11
+ Ting::Syllable.new( Ting::Initial::Bo, Ting::Final::Ie, 2 ),
12
+ Ting::Syllable.new( Ting::Initial::De, Ting::Final::E, 5 ),
13
+ Ting::Syllable.new( Ting::Initial::He, Ting::Final::Ai, 3 ),
14
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::An, 4 ),
15
+ ])
16
+ end
17
+
18
+ it 'should be able to parse erhua' do
19
+ pinyin = "wèir Wèir"
20
+ expect(parser.parse(pinyin)).to eq([
21
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Ui, 4 ),
22
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Er, 5 ),
23
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Ui, 4 ),
24
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Er, 5 ),
25
+ ])
26
+ end
27
+
28
+ it 'should be able to discern erhua from other Ri syllables' do
29
+ pinyin = Ting.pretty_tones 'yang2rou4'
30
+ expect(parser.parse(pinyin)).to eq([
31
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Iang, 2 ),
32
+ Ting::Syllable.new( Ting::Initial::Ri, Ting::Final::Ou, 4 ),
33
+ ])
34
+
35
+ pinyin = Ting.pretty_tones 'sui1ran2'
36
+ expect(parser.parse(pinyin)).to eq([
37
+ Ting::Syllable.new( Ting::Initial::Si, Ting::Final::Ui, 1 ),
38
+ Ting::Syllable.new( Ting::Initial::Ri, Ting::Final::An, 2 ),
39
+ ])
40
+ end
41
+
42
+ it 'should parse er4 and her2 correctly' do
43
+ expect(parser.parse('èr')).to eq([
44
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Er, 4 ),
45
+ ])
46
+ expect(parser.parse('hér')).to eq([
47
+ Ting::Syllable.new( Ting::Initial::He, Ting::Final::E, 2 ),
48
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Er, 5 ),
49
+ ])
50
+ end
51
+
52
+ it 'should parse ou1zhou1 correctly' do
53
+ pinyin = Ting.pretty_tones('ou1zhou1')
54
+ expect(parser.parse(pinyin)).to eq([
55
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Ou, 1 ),
56
+ Ting::Syllable.new( Ting::Initial::Zhi, Ting::Final::Ou, 1 ),
57
+ ])
58
+ end
59
+
60
+ it 'should parse sheng3lve4 correctly' do
61
+ expect(parser.parse('shěnglüè')).to eq([
62
+ Ting::Syllable.new( Ting::Initial::Shi, Ting::Final::Eng, 3 ),
63
+ Ting::Syllable.new( Ting::Initial::Le, Ting::Final::Ue, 4 ),
64
+ ])
65
+ end
66
+
67
+ it 'should parse regardless of apostrophes and weird whitespace' do
68
+ pinyin = "Xī'ān\thǎowánr\tma?\nHǎowánr!"
69
+ expect(parser.parse(pinyin).map(&:tone)).to eq([1, 1, 3, 2, 5, 5, 3, 2, 5])
70
+ end
71
+
72
+ it 'should parse ambiguous syllables based on context' do
73
+ pinyin = 'gūnánguǎnǚ'
74
+ expect(parser.parse(pinyin)).to eq([
75
+ Ting::Syllable.new( Ting::Initial::Ge, Ting::Final::U, 1 ),
76
+ Ting::Syllable.new( Ting::Initial::Ne, Ting::Final::An, 2 ),
77
+ Ting::Syllable.new( Ting::Initial::Ge, Ting::Final::Ua, 3 ),
78
+ Ting::Syllable.new( Ting::Initial::Ne, Ting::Final::V, 3 ),
79
+ ])
80
+
81
+ pinyin = 'yángròu'
82
+ expect(parser.parse(pinyin)).to eq([
83
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Iang, 2 ),
84
+ Ting::Syllable.new( Ting::Initial::Ri, Ting::Final::Ou, 4 ),
85
+ ])
86
+ end
87
+
88
+ it 'should parse some invalid pinyin (missing apostrophe)' do
89
+ # Syllables that begin with [aeo] must be prefixed with an apostrophe in the middle of the word.
90
+ # Ref.: https://en.wikipedia.org/wiki/Pinyin#Pronunciation_of_initials, "Note on the apostrophe"
91
+ # Still, Ting should be able to parse these syllables if they follow unambiguous characters.
92
+
93
+ expect(parser.parse('hǎiàn')).to eq([
94
+ Ting::Syllable.new( Ting::Initial::He, Ting::Final::Ai, 3 ),
95
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::An, 4 ),
96
+ ])
97
+
98
+ expect(parser.parse('mòshuǐer')).to eq([
99
+ Ting::Syllable.new( Ting::Initial::Mo, Ting::Final::O, 4 ),
100
+ Ting::Syllable.new( Ting::Initial::Shi, Ting::Final::Ui, 3 ),
101
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::Er, 5 ),
102
+ ])
103
+
104
+ expect(parser.parse('gōngānjú')).to eq([
105
+ Ting::Syllable.new( Ting::Initial::Ge, Ting::Final::Ong, 1 ),
106
+ Ting::Syllable.new( Ting::Initial::Empty, Ting::Final::An, 1 ),
107
+ Ting::Syllable.new( Ting::Initial::Ji, Ting::Final::V, 2 ),
108
+ ])
109
+ end
110
+ end
@@ -18,7 +18,18 @@ describe Ting do
18
18
  end
19
19
 
20
20
  it 'should parse syllables correctly' do
21
- expect(Ting.pretty_tones('wo3 de peng2you3 hen3 zhuang4')).to eq('wǒ de péngyǒu hěn zhuàng')
22
- expect(Ting.bpmf('wo3 de peng2you3 hen3 zhuang4')).to eq('ㄨㄛˇ ㄉㄜ˙ ㄆㄥˊ ㄧㄡˇ ㄏㄣˇ ㄓㄨㄤˋ')
21
+ expect(Ting.pretty_tones('Wo3 de Ou1zhou1 peng2you3 hen3 zhuang4')).to eq('wǒ de ōuzhōu péngyǒu hěn zhuàng')
22
+ expect(Ting.bpmf('Wo3 de peng2you3 hen3 zhuang4')).to eq('ㄨㄛˇ ㄉㄜ˙ ㄆㄥˊ ㄧㄡˇ ㄏㄣˇ ㄓㄨㄤˋ')
23
+ end
24
+
25
+ it 'should be able to pretty-print simple strings' do
26
+ expect(Ting.pretty_tones('wo3 ai4 ni3')).to eq('wǒ ài nǐ')
27
+ expect(Ting.pretty_tones('you3dian3r hao3xiao4')).to eq('yǒudiǎnr hǎoxiào')
28
+ end
29
+
30
+ it 'should insert apostrophes when appropriate' do
31
+ expect(Ting.pretty_tones('hai3an4')).to eq("hǎi'àn")
32
+ expect(Ting.pretty_tones('ding4e2')).to eq("dìng'é")
33
+ expect(Ting.pretty_tones('an5an5an5an5an')).to eq("an'an'an'an'an")
23
34
  end
24
35
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ting
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.0
4
+ version: 0.12.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arne Brasseur
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-11-06 00:00:00.000000000 Z
11
+ date: 2018-10-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -104,6 +104,7 @@ files:
104
104
  - lib/ting/tones/supernum.rb
105
105
  - lib/ting/version.rb
106
106
  - lib/ting/writer.rb
107
+ - spec/hanyu_pinyin_parser_spec.rb
107
108
  - spec/jruby_csv_spec.rb
108
109
  - spec/palladius_spec.rb
109
110
  - spec/spec_helper.rb
@@ -132,12 +133,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
132
133
  version: '0'
133
134
  requirements: []
134
135
  rubyforge_project:
135
- rubygems_version: 2.6.13
136
+ rubygems_version: 2.7.6
136
137
  signing_key:
137
138
  specification_version: 4
138
139
  summary: A conversion library for Chinese transcription methods like Hanyu Pinyin,
139
140
  Bopomofo and Wade-Giles.
140
141
  test_files:
142
+ - spec/hanyu_pinyin_parser_spec.rb
141
143
  - spec/jruby_csv_spec.rb
142
144
  - spec/palladius_spec.rb
143
145
  - spec/spec_helper.rb