cantonese 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4ddaa4f96c77b0c35bb61e83b2feef43a234f844
4
- data.tar.gz: 5684236f0d3ba3626de4a10ebdc1ac9eeeac3734
3
+ metadata.gz: cd26a3b32ad12b765087bfacfc12d15872fb8449
4
+ data.tar.gz: 237081e5067765b2687f85814cde13f52274c5bf
5
5
  SHA512:
6
- metadata.gz: a02f42b7aac35e3e2071114a868be195d592f2a93eb087f304afa058700d62664d5595160ea57d208b49bb2c3eb0f7cbc4b973b3e70b41933ee9682d87be0f81
7
- data.tar.gz: efc2ba3e781a25a0a2cf72c75d52eda78298a6d0818cbbc382f1832042a240ccfce9989ba6f2a10331806eb911c40bde627e80e731f16c345fe82d0e9098cf03
6
+ metadata.gz: 04730e07d52a91228cd157eaaf08ccc012fcc2ae453dc37bde26ffa7d5e413ebcef3b1d3c0fdf9491dd14bf47b34101a65a5c464d68ab785708ce5c08a0c7449
7
+ data.tar.gz: 7cca9753f3bd2fcbd9e954252c33404c8030dacb48f8fc14753a5d513ea20c13e08b13ec070445663001953302ffda7933039bfde62ba584705dff0e856d880d
@@ -18,6 +18,7 @@ Gem::Specification.new do |spec|
18
18
  spec.require_paths = ["lib"]
19
19
 
20
20
  spec.add_dependency "nokogiri"
21
+ spec.add_dependency "tidy_ffi"
21
22
 
22
23
  spec.add_development_dependency "bundler", "~> 1.5"
23
24
  spec.add_development_dependency "rake"
@@ -1,16 +1,12 @@
1
1
  require 'nokogiri'
2
2
  require 'open-uri'
3
3
  require 'cgi'
4
+ require 'tidy_ffi'
4
5
 
5
6
  module Cantonese
6
7
  module Scraper
7
8
  class WordScraper
8
9
  def crawl(word)
9
- html = fetch(word)
10
- process(html)
11
- end
12
-
13
- def fetch(word)
14
10
  # convert word parameter into big5
15
11
  word_big5 = word.encode('Big5', 'UTF-8', :invalid => :replace, :undef => :replace, :replace => '')
16
12
  url = "http://humanum.arts.cuhk.edu.hk/Lexis/lexi-can/search.php?q=" + CGI.escape(word_big5)
@@ -18,11 +14,9 @@ module Cantonese
18
14
  # fetch and get the page in UTF8
19
15
  html = open(url).read
20
16
  html = html.encode('UTF-8', 'Big5', :invalid => :replace, :undef => :replace, :replace => '?')
21
- end
17
+ html = TidyFFI::Tidy.clean(html.gsub(/\0/, ''))
22
18
 
23
- def process(html)
24
- doc = Nokogiri::HTML(html, nil, 'UTF-8')
25
-
19
+ doc = Nokogiri::HTML(html, nil, 'UTF-8')
26
20
  word = doc.search(".w").first.text
27
21
 
28
22
  radical_id = doc.search("//*[@class = 't' and .='部首:']/following-sibling::td[1]").text.strip.tr('[] ', '').to_i rescue nil
@@ -36,9 +30,9 @@ module Cantonese
36
30
 
37
31
  syllable = doc.search('//form/table[1]/tr[position()>1]').collect do |row|
38
32
  sound = row.search("./td[1]")
39
- initial = sound.xpath("./*[@color='red']").text rescue ""
40
- final = sound.xpath("./*[@color='green']").text rescue ""
41
- tone = sound.xpath("./*[@color='blue']").text rescue ""
33
+ initial = sound.xpath("./*[@color='red']").text.strip rescue ""
34
+ final = sound.xpath("./*[@color='green']").text.strip rescue ""
35
+ tone = sound.xpath("./*[@color='blue']").text.strip rescue ""
42
36
  sound_text = sound.text
43
37
  pronunciation = "http://humanum.arts.cuhk.edu.hk/Lexis/lexi-can/sound/#{sound_text}.wav"
44
38
 
@@ -52,8 +46,17 @@ module Cantonese
52
46
  note_text = nil
53
47
  end
54
48
 
49
+ full = "#{initial}#{final}#{tone}"
50
+
51
+ # patch to fix error on database
52
+ if full == "6bwik1"
53
+ full = "kwik1"
54
+ initial = "k"
55
+ pronunciation = "http://humanum.arts.cuhk.edu.hk/Lexis/lexi-can/sound/#{full}.wav"
56
+ end
57
+
55
58
  {
56
- :full => "#{initial}#{final}#{tone}",
59
+ :full => full,
57
60
  :initial => initial,
58
61
  :final => final,
59
62
  :tone => tone,
@@ -62,8 +65,8 @@ module Cantonese
62
65
  :note => note_text
63
66
  }
64
67
  end
65
-
66
68
  {
69
+ :url => url,
67
70
  :text => word,
68
71
  :radical_id => radical_id,
69
72
  :stroke => stroke,
@@ -1,3 +1,3 @@
1
1
  module Cantonese
2
- VERSION = "0.2.0"
2
+ VERSION = "0.2.1"
3
3
  end
@@ -0,0 +1,325 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: http://humanum.arts.cuhk.edu.hk/Lexis/lexi-can/search.php?q=%F3p
6
+ body:
7
+ encoding: US-ASCII
8
+ string: ''
9
+ headers:
10
+ Accept-Encoding:
11
+ - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
12
+ Accept:
13
+ - "*/*"
14
+ User-Agent:
15
+ - Ruby
16
+ response:
17
+ status:
18
+ code: 200
19
+ message: OK
20
+ headers:
21
+ Date:
22
+ - Mon, 31 Mar 2014 10:23:38 GMT
23
+ Server:
24
+ - Apache/2.2.15 (CentOS)
25
+ X-Powered-By:
26
+ - PHP/5.3.3
27
+ Content-Length:
28
+ - '5636'
29
+ Connection:
30
+ - close
31
+ Content-Type:
32
+ - text/html
33
+ body:
34
+ encoding: ASCII-8BIT
35
+ string: !binary |-
36
+ PGh0bWw+PGhlYWQ+PHRpdGxlPrhmu3m8Zq21sHS1/KZyrnc8L3RpdGxlPjxz
37
+ dHlsZSB0eXBlPSJ0ZXh0L2NzcyI+YSB7IHRleHQtZGVjb3JhdGlvbjogbm9u
38
+ ZX0gLnRleHQgeyBsaW5lLWhlaWdodDogMTUwJSB9PC9zdHlsZT48bWV0YSBo
39
+ dHRwLWVxdWl2PSJDb250ZW50LVR5cGUiIGNvbnRlbnQ9InRleHQvaHRtbDsg
40
+ Y2hhcnNldD1iaWc1Ij48c2NyaXB0IGxhbmd1YWdlPSJKYXZhU2NyaXB0Ij4K
41
+ PCEtLQpmdW5jdGlvbiBNTV9qdW1wTWVudSh0YXJnLHNlbE9iaixyZXN0b3Jl
42
+ KXsgLy92My4wCiAgZXZhbCh0YXJnKyIubG9jYXRpb249JyIrc2VsT2JqLm9w
43
+ dGlvbnNbc2VsT2JqLnNlbGVjdGVkSW5kZXhdLnZhbHVlKyInIik7CiAgaWYg
44
+ KHJlc3RvcmUpIHNlbE9iai5zZWxlY3RlZEluZGV4PTA7Cn0KZnVuY3Rpb24g
45
+ cmVmICh1cmwpIHsKICByZXdpbj13aW5kb3cub3Blbih1cmwsJ3JlZicsJ3Rv
46
+ b2Jhcj0wLHN0YXR1cz0wLHNjcm9sbGJhcnM9MSxyZXNpemFibGU9MSx3aWR0
47
+ aD02MDAsaGVpZ2h0PTMwMCcpOwogIHNldFRpbWVvdXQgKCdyZXdpbi5mb2N1
48
+ cygpJywgMTAwKTsKfQovLy0tPgo8L3NjcmlwdD4KPHN0eWxlIHR5cGU9InRl
49
+ eHQvY3NzIj4KLnQgeyBmb250LXNpemU6IDEzOyBub3dyYXA7IHRleHQtYWxp
50
+ Z246IHJpZ2h0OyBjb2xvcjogbmF2eX0KLnQyIHsgZm9udC1zaXplOiAxMzsg
51
+ bm93cmFwOyB0ZXh0LWFsaWduOiBsZWZ0fQoudDMgeyBmb250LXNpemU6IDEz
52
+ OyBub3dyYXA7IHRleHQtYWxpZ246IGNlbnRlcn0KLncgeyBmb250LXNpemU6
53
+ IDM2OyBmb250LXdlaWdodDogYm9sZDsgY29sb3I6IHJlZDsgdGV4dC1hbGln
54
+ bjogY2VudGVyIH0KPC9zdHlsZT4KPHNjcmlwdCBsYW5ndWFnZT0iSmF2YVNj
55
+ cmlwdCI+CmZ1bmN0aW9uIHhpZF9kb3duKFhpZCkgewoJaWYgKGRvY3VtZW50
56
+ LmFsbFtYaWRdLnN0eWxlLmRpc3BsYXkgPT0gIm5vbmUiKSB7CgkJZG9jdW1l
57
+ bnQuYWxsW1hpZF0uc3R5bGUuZGlzcGxheSA9ICJibG9jayI7Cgl9IGVsc2Ug
58
+ ewoJCWRvY3VtZW50LmFsbFtYaWRdLnN0eWxlLmRpc3BsYXkgPSAibm9uZSI7
59
+ Cgl9Cn0KPC9zY3JpcHQ+PC9oZWFkPjxib2R5IGJhY2tncm91bmQ9Ii9MZXhp
60
+ cy9sZXhpLWNhbi9pbWcvcHBiazAxNC5qcGciID48dGFibGUgd2lkdGg9IjEw
61
+ MCUiIGJvcmRlcj0iMCI+CiAgPHRyPiAKICAgIDx0ZCByb3dzcGFuPSIyIiBj
62
+ bGFzcz13PvNwPC90ZD4KICAgIDx0ZCBjbGFzcz10PrOhrbo6PC90ZD4KCQk8
63
+ dGQgY2xhc3M9dDI+PGEgaHJlZj0icmFkLXN0ci5waHA/cmFkPTE2NyI+PGlt
64
+ ZyBzcmM9ImltZy9yYWQvcmFkMTY3LmdpZiIgYm9yZGVyPTAgYWxpZ249YWJz
65
+ bWlkZGxlPiBbMTY3XTwvYT48L3RkPgogICAgPHRkIGNsYXNzPXQ+tae1ZTo8
66
+ L3RkPgoJCTx0ZCBjbGFzcz10Mj48YSBocmVmPSJyYWQtc3RyLnBocD9zdHI9
67
+ MTkiPjE5PC9hPjwvdGQ+CiAgICA8dGQgY2xhc3M9dD6mcq21pMDD/jo8L3Rk
68
+ PgoJCTx0ZCBjbGFzcz10MyBiZ2NvbG9yPXllbGxvdz48YSBocmVmPSJjbGFz
69
+ c2lmaWVkLnBocD9zdD0yIj6vfa21pnI8L2E+PC90ZD4KCQk8dGQgYWxpZ249
70
+ Y2VudGVyPjxhIGhyZWY9IiMiIG9uQ2xpY2s9InJlZignaHR0cDovL3pob25n
71
+ d2VuLmNvbS9kLzI0My94MTEyLmh0bScpIj48aW1nIHNyYz0iL0ltZy96aG9u
72
+ Z3B1LmpwZyIgYm9yZGVyPTA+PC9hPiA8IS0tYSBocmVmPSIjIiBvbkNsaWNr
73
+ PSJyZWYoJ2h0dHA6Ly8xNDAuMTExLjM0LjQ2L2NnaS1iaW4vZGljdC9uZXdz
74
+ ZWFyY2guY2dpP0RhdGFiYXNlPWRpY3QmUXVlcnlTY29wZT1OYW1lJlF1ZXJ5
75
+ Q29tbWFuZD1maW5kJkdyYXBoaWNXb3JkPXllcyZRdWVyeVN0cmluZz0lRjNw
76
+ JykiLS0+CgkJPGEgaHJlZj0iIyIgb25DbGljaz0icmVmKCdodHRwOi8vMTQw
77
+ LjExMS4zNC40Ni9jZ2ktYmluL25ld0RpY3QvZGljdC5zaD9jb25kPSVGM3Am
78
+ cGllY2VMZW49NTAmZmxkPTEmY2F0PSZ1a2V5PS02MjQ3MjExODgmc2VyaWFs
79
+ PTMmcmVjTm89MCZvcD0maW1nRm9udD0xJykiPgoJCTxpbWcgc3JjPSIvSW1n
80
+ L2d5Y2QyYS5naWYiIGJvcmRlcj0wPjwvYT48L3RkPgogIDwvdHI+CiAgPHRy
81
+ PiAKICAgIDx0ZCBjbGFzcz10PqRqpK29WDo8L3RkPgoJCTx0ZCBjbGFzcz10
82
+ Mj5GMzcwPC90ZD4KICAgIDx0ZCBjbGFzcz10Pq3cvmW9WDo8L3RkPgoJCTx0
83
+ ZCBjbGFzcz10Mj6q96Tgw/ik3zwvdGQ+CiAgICA8dGQgY2xhc3M9dD7AV6fH
84
+ IC8gwFemuDo8L3RkPgoJCTx0ZCBjbGFzcz10Mj4tIC8gMDwvdGQ+CiAgICA8
85
+ dGQgYWxpZ249Y2VudGVyPjxhIGhyZWY9IiMiIG9uQ2xpY2s9InJlZignaHR0
86
+ cDovL2Vwc2lsb24zLmdlb3JnZXRvd24uZWR1L35wZXRlcnNlZS9jZ2ktYmlu
87
+ L3dvcmRsb29rLmNnaT9zZWFyY2h0eXBlPWJpZzUmd2hlcmU9YW55d2hlcmUm
88
+ d29yZD0lRjNwJykiPjxpbWcgc3JjPSIvSW1nL2NlZGljdDJfbmV3LmdpZiIg
89
+ Ym9yZGVyPTA+PC9hPiA8YSBocmVmPSIjIiBvbkNsaWNrPSJyZWYoJy9jZ2kt
90
+ YmluL2FncmVwLWxpbmRpY3Q/cXVlcnk9JUYzcCZib29sZWFuPW5vJmNhc2U9
91
+ b24mY2F0ZWdvcnk9d2hvbGVyZWNvcmQnKSI+PGltZyBzcmM9Ii9JbWcvbGlu
92
+ ZGljdF9sb2dvLmdpZiIgYm9yZGVyPTA+PC9hPjwvdGQ+CiAgPC90cj4KPC90
93
+ YWJsZT4KPGZvcm0+PHRhYmxlIHdpZHRoPSIxMDAlIiBib3JkZXI9IjEiPgog
94
+ IDx0ciBiZ2NvbG9yPSNmZmYwYzI+IAogICAgPHRoIG5vd3JhcCB3aWR0aD0x
95
+ MDA+rbW4YDxicj48Zm9udCBzaXplPS0yPiitu7Tku3mopb7Hvse3fCk8L2Zv
96
+ bnQ+PC90aD4KICAgIDx0aCBub3dyYXAgd2lkdGg9MzA+uGY8YnI+rbU8L3Ro
97
+ PgogICAgPHRoIG5vd3JhcCB3aWR0aD03MD48Zm9udCBjb2xvcj0iZ3JheSIg
98
+ ZmFjZT0iV2luZ2RpbmdzIj4mYW1wOzwvZm9udD4grtq+2jwvdGg+CiAgICA8
99
+ dGggbm93cmFwIHdpZHRoPTEwMD6mUK21pnI8L3RoPgogICAgPHRoIG5vd3Jh
100
+ cCB3aWR0aD04MD6s28P2rbW4YDwvdGg+CiAgICA8dGggbm93cmFwPrX8qNIo
101
+ PGZvbnQgY29sb3I9bWFyb29uIHNpemU9LTE+uNHEwDwvZm9udD4pIC8gPGZv
102
+ bnQgY29sb3I9Zm9yZXN0Z3JlZW4gc2l6ZT0tMT6zxrX5PC9mb250PjwvdGg+
103
+ CiAgPC90cj4KICA8dHI+CiAgICA8dGQgbm93cmFwIGFsaWduPWNlbnRlcj48
104
+ Zm9udCBjb2xvcj1yZWQgc2l6ZT0rMT5vADwvZm9udD48Zm9udCBjb2xvcj1n
105
+ cmVlbiBzaXplPSsxPnU8L2ZvbnQ+PGZvbnQgY29sb3I9Ymx1ZSBzaXplPSsx
106
+ PjE8L2ZvbnQ+PC90ZD4KICAgIDx0ZCBhbGlnbj1jZW50ZXI+PGEgaHJlZj0i
107
+ c291bmQucGhwP3M9b3UxIiB0YXJnZXQ9c291bmQ+PGltZyBzcmM9ImltZy9z
108
+ b3VuZGVyLmdpZiIgYm9yZGVyPTA+PC9hPjwvdGQ+Cgk8dGQgbm93cmFwPjxm
109
+ b250IHNpemU9LTE+pEik5blxuuI8L2ZvbnQ+PC90ZD4KICAgIDx0ZCBub3dy
110
+ YXA+CjxhIGhyZWY9InNlYXJjaC5waHA/cT0lQzMlRUYiPsPvPC9hPiwgPGEg
111
+ aHJlZj0ic2VhcmNoLnBocD9xPSVGMmoiPvJqPC9hPgk8L3RkPgogICAgPHRk
112
+ PjxzZWxlY3Qgb25DaGFuZ2U9Ik1NX2p1bXBNZW51KCdzZWxmJyx0aGlzLDAp
113
+ Ij4KICAgIDxvcHRpb24gc2VsZWN0ZWQgdmFsdWU9IiMiPi0tv+++3C0tPC9v
114
+ cHRpb24+CiAgICA8b3B0aW9uIHZhbHVlPSJwaG8tcmVsLnBocD9zMT1vACZz
115
+ Mj11Ij6mUMFuplDD/Twvb3B0aW9uPgogICAgPG9wdGlvbiB2YWx1ZT0icGhv
116
+ LXJlbC5waHA/czI9dSZzMz0xIj6mUMP9plC91Twvb3B0aW9uPgogICAgPG9w
117
+ dGlvbiB2YWx1ZT0icGhvLXJlbC5waHA/czE9bwAmczM9MSI+plDBbqZQvdU8
118
+ L29wdGlvbj4KICA8L3NlbGVjdD48L3RkPgogICAgPHRkPjxkaXYgbm93cmFw
119
+ PjwvZGl2Pjxmb250IGNvbG9yPWZvcmVzdGdyZWVuIHNpemU9LTE+plChdTxh
120
+ IGhyZWY9InNlYXJjaC5waHA/cT0lQzMlRUYiPsPvPC9hPqF2pnI8L2ZvbnQ+
121
+ PC90ZD4KICA8L3RyPgogIDx0cj4KICAgIDx0ZCBub3dyYXAgYWxpZ249Y2Vu
122
+ dGVyPjxmb250IGNvbG9yPXJlZCBzaXplPSsxPmw8L2ZvbnQ+PGZvbnQgY29s
123
+ b3I9Z3JlZW4gc2l6ZT0rMT51azwvZm9udD48Zm9udCBjb2xvcj1ibHVlIHNp
124
+ emU9KzE+NjwvZm9udD48L3RkPgogICAgPHRkIGFsaWduPWNlbnRlcj48YSBo
125
+ cmVmPSJzb3VuZC5waHA/cz1sdWs2IiB0YXJnZXQ9c291bmQ+PGltZyBzcmM9
126
+ ImltZy9zb3VuZGVyLmdpZiIgYm9yZGVyPTA+PC9hPjwvdGQ+Cgk8dGQgbm93
127
+ cmFwPjxmb250IHNpemU9LTE+pEik5blxuuI8L2ZvbnQ+PC90ZD4KICAgIDx0
128
+ ZCBub3dyYXA+CjxhIGhyZWY9InNlYXJjaC5waHA/cT0lRUUlNUMiPu5cPC9h
129
+ PiwgPGEgaHJlZj0ic2VhcmNoLnBocD9xPSVERSVENyI+3tc8L2E+LCA8YSBo
130
+ cmVmPSJzZWFyY2gucGhwP3E9JURFJUY3Ij7e9zwvYT4gPGEgaHJlZj0icGhv
131
+ LXJlbC5waHA/czE9bCZzMj11ayZzMz02Ij48Zm9udCBzaXplPS0xPls0Ni4u
132
+ XTwvZm9udD48L2E+CTwvdGQ+CiAgICA8dGQ+PHNlbGVjdCBvbkNoYW5nZT0i
133
+ TU1fanVtcE1lbnUoJ3NlbGYnLHRoaXMsMCkiPgogICAgPG9wdGlvbiBzZWxl
134
+ Y3RlZCB2YWx1ZT0iIyI+LS2/777cLS08L29wdGlvbj4KICAgIDxvcHRpb24g
135
+ dmFsdWU9InBoby1yZWwucGhwP3MxPWwmczI9dWsiPqZQwW6mUMP9PC9vcHRp
136
+ b24+CiAgICA8b3B0aW9uIHZhbHVlPSJwaG8tcmVsLnBocD9zMj11ayZzMz02
137
+ Ij6mUMP9plC91Twvb3B0aW9uPgogICAgPG9wdGlvbiB2YWx1ZT0icGhvLXJl
138
+ bC5waHA/czE9bCZzMz02Ij6mUMFuplC91Twvb3B0aW9uPgogIDwvc2VsZWN0
139
+ PjwvdGQ+CiAgICA8dGQ+PGRpdiBub3dyYXA+uWTzcDwvZGl2PjwvdGQ+CiAg
140
+ PC90cj4KPC90YWJsZT48dGFibGUgd2lkdGg9MTAwJSBib3JkZXI9MCBjZWxs
141
+ c3BhY2luZz0wIGNlbGxwYWRkaW5nPTA+PHRyPjx0ZD48Zm9udCBzaXplPS0x
142
+ IGNvbG9yPWdyYXk+t2qvwaa4vMY6IDQ2MzU8L2ZvbnQ+PC90ZD48dGQgYWxp
143
+ Z249cmlnaHQ+PGZvbnQgc2l6ZT0tMT4oPGEgaHJlZj0iYWRtaW4vZWRpdC5w
144
+ aHA/bmV3PUVkaXQmcT0lRjNwIj663rJ6pEit+7FNpc6wzzwvYT4pPC9mb250
145
+ PjwvdGQ+PC90cj48L3RhYmxlPrB0t2bCSTo8YnI+PC9mb3JtPjxocj48Zm9u
146
+ dCBjb2xvcj1ncmF5PlVuaWNvZGU6IDwvZm9udD48YSBocmVmPSJodHRwOi8v
147
+ d3d3LnVuaWNvZGUub3JnL2NnaS1iaW4vR2V0VW5paGFuRGF0YS5wbD9jb2Rl
148
+ cG9pbnQ9OTNENSIgdGFyZ2V0PV9ibGFuaz48aW1nIHNyYz0iL0ltZy91bmlj
149
+ b2RlMi5naWYiIGJvcmRlcj0wIGFsaWduPWFic21pZGRsZT48L2E+IDxmb250
150
+ IHNpemU9LTEgY29sb3I9Z3JheT5VKzkzRDU8L2ZvbnQ+PHRhYmxlIGJvcmRl
151
+ cj0wIGNlbGxzcGFjaW5nPTUgY2VsbHBhZGRpbmc9NT48dHI+PHRkIGNsYXNz
152
+ PXQ+un67eaRqpnKo5To8L3RkPjx0ZCB3aWR0aD0xMDA+PGZvbnQgc2l6ZT0t
153
+ MT5QZy40MjUwPC9mb250PjwvdGQ+PHRkIGNsYXNzPXQ+tLazcbjcOjwvdGQ+
154
+ PHRkPjxmb250IHNpemU9LTE+bHU0IDwvZm9udD48L3RkPjwvdHI+PHRyPjx0
155
+ ZCBjbGFzcz10PrFkurOmcqjlOjwvdGQ+PHRkIHdpZHRoPTEwMD48Zm9udCBz
156
+ aXplPS0xPlBnLjEyNDcuMjkwPC9mb250PjwvdGQ+PHRkIGNsYXNzPXQ+rV7E
157
+ tjo8L3RkPjx0ZD48Zm9udCBzaXplPS0xPjwvZm9udD48L3RkPjwvdHI+PHRy
158
+ Pjx0ZCBjbGFzcz10Pk1hdHRoZXdzOjwvdGQ+PHRkIHdpZHRoPTEwMD48Zm9u
159
+ dCBzaXplPS0xPi08L2ZvbnQ+PC90ZD48dGQgYWxpZ249cmlnaHQ+PGZvbnQg
160
+ c2l6ZT0tMT48L2ZvbnQ+PC90ZD48dGQ+PC90ZD48L3RyPjwvdGFibGU+PC9i
161
+ b2R5PjwvaHRtbD4=
162
+ http_version:
163
+ recorded_at: Mon, 31 Mar 2014 10:24:06 GMT
164
+ - request:
165
+ method: get
166
+ uri: http://humanum.arts.cuhk.edu.hk/Lexis/lexi-can/search.php?q=%F3p
167
+ body:
168
+ encoding: US-ASCII
169
+ string: ''
170
+ headers:
171
+ Accept-Encoding:
172
+ - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
173
+ Accept:
174
+ - "*/*"
175
+ User-Agent:
176
+ - Ruby
177
+ response:
178
+ status:
179
+ code: 200
180
+ message: OK
181
+ headers:
182
+ Date:
183
+ - Mon, 31 Mar 2014 10:32:44 GMT
184
+ Server:
185
+ - Apache/2.2.15 (CentOS)
186
+ X-Powered-By:
187
+ - PHP/5.3.3
188
+ Content-Length:
189
+ - '5636'
190
+ Connection:
191
+ - close
192
+ Content-Type:
193
+ - text/html
194
+ body:
195
+ encoding: ASCII-8BIT
196
+ string: !binary |-
197
+ PGh0bWw+PGhlYWQ+PHRpdGxlPrhmu3m8Zq21sHS1/KZyrnc8L3RpdGxlPjxz
198
+ dHlsZSB0eXBlPSJ0ZXh0L2NzcyI+YSB7IHRleHQtZGVjb3JhdGlvbjogbm9u
199
+ ZX0gLnRleHQgeyBsaW5lLWhlaWdodDogMTUwJSB9PC9zdHlsZT48bWV0YSBo
200
+ dHRwLWVxdWl2PSJDb250ZW50LVR5cGUiIGNvbnRlbnQ9InRleHQvaHRtbDsg
201
+ Y2hhcnNldD1iaWc1Ij48c2NyaXB0IGxhbmd1YWdlPSJKYXZhU2NyaXB0Ij4K
202
+ PCEtLQpmdW5jdGlvbiBNTV9qdW1wTWVudSh0YXJnLHNlbE9iaixyZXN0b3Jl
203
+ KXsgLy92My4wCiAgZXZhbCh0YXJnKyIubG9jYXRpb249JyIrc2VsT2JqLm9w
204
+ dGlvbnNbc2VsT2JqLnNlbGVjdGVkSW5kZXhdLnZhbHVlKyInIik7CiAgaWYg
205
+ KHJlc3RvcmUpIHNlbE9iai5zZWxlY3RlZEluZGV4PTA7Cn0KZnVuY3Rpb24g
206
+ cmVmICh1cmwpIHsKICByZXdpbj13aW5kb3cub3Blbih1cmwsJ3JlZicsJ3Rv
207
+ b2Jhcj0wLHN0YXR1cz0wLHNjcm9sbGJhcnM9MSxyZXNpemFibGU9MSx3aWR0
208
+ aD02MDAsaGVpZ2h0PTMwMCcpOwogIHNldFRpbWVvdXQgKCdyZXdpbi5mb2N1
209
+ cygpJywgMTAwKTsKfQovLy0tPgo8L3NjcmlwdD4KPHN0eWxlIHR5cGU9InRl
210
+ eHQvY3NzIj4KLnQgeyBmb250LXNpemU6IDEzOyBub3dyYXA7IHRleHQtYWxp
211
+ Z246IHJpZ2h0OyBjb2xvcjogbmF2eX0KLnQyIHsgZm9udC1zaXplOiAxMzsg
212
+ bm93cmFwOyB0ZXh0LWFsaWduOiBsZWZ0fQoudDMgeyBmb250LXNpemU6IDEz
213
+ OyBub3dyYXA7IHRleHQtYWxpZ246IGNlbnRlcn0KLncgeyBmb250LXNpemU6
214
+ IDM2OyBmb250LXdlaWdodDogYm9sZDsgY29sb3I6IHJlZDsgdGV4dC1hbGln
215
+ bjogY2VudGVyIH0KPC9zdHlsZT4KPHNjcmlwdCBsYW5ndWFnZT0iSmF2YVNj
216
+ cmlwdCI+CmZ1bmN0aW9uIHhpZF9kb3duKFhpZCkgewoJaWYgKGRvY3VtZW50
217
+ LmFsbFtYaWRdLnN0eWxlLmRpc3BsYXkgPT0gIm5vbmUiKSB7CgkJZG9jdW1l
218
+ bnQuYWxsW1hpZF0uc3R5bGUuZGlzcGxheSA9ICJibG9jayI7Cgl9IGVsc2Ug
219
+ ewoJCWRvY3VtZW50LmFsbFtYaWRdLnN0eWxlLmRpc3BsYXkgPSAibm9uZSI7
220
+ Cgl9Cn0KPC9zY3JpcHQ+PC9oZWFkPjxib2R5IGJhY2tncm91bmQ9Ii9MZXhp
221
+ cy9sZXhpLWNhbi9pbWcvcHBiazAxNC5qcGciID48dGFibGUgd2lkdGg9IjEw
222
+ MCUiIGJvcmRlcj0iMCI+CiAgPHRyPiAKICAgIDx0ZCByb3dzcGFuPSIyIiBj
223
+ bGFzcz13PvNwPC90ZD4KICAgIDx0ZCBjbGFzcz10PrOhrbo6PC90ZD4KCQk8
224
+ dGQgY2xhc3M9dDI+PGEgaHJlZj0icmFkLXN0ci5waHA/cmFkPTE2NyI+PGlt
225
+ ZyBzcmM9ImltZy9yYWQvcmFkMTY3LmdpZiIgYm9yZGVyPTAgYWxpZ249YWJz
226
+ bWlkZGxlPiBbMTY3XTwvYT48L3RkPgogICAgPHRkIGNsYXNzPXQ+tae1ZTo8
227
+ L3RkPgoJCTx0ZCBjbGFzcz10Mj48YSBocmVmPSJyYWQtc3RyLnBocD9zdHI9
228
+ MTkiPjE5PC9hPjwvdGQ+CiAgICA8dGQgY2xhc3M9dD6mcq21pMDD/jo8L3Rk
229
+ PgoJCTx0ZCBjbGFzcz10MyBiZ2NvbG9yPXllbGxvdz48YSBocmVmPSJjbGFz
230
+ c2lmaWVkLnBocD9zdD0yIj6vfa21pnI8L2E+PC90ZD4KCQk8dGQgYWxpZ249
231
+ Y2VudGVyPjxhIGhyZWY9IiMiIG9uQ2xpY2s9InJlZignaHR0cDovL3pob25n
232
+ d2VuLmNvbS9kLzI0My94MTEyLmh0bScpIj48aW1nIHNyYz0iL0ltZy96aG9u
233
+ Z3B1LmpwZyIgYm9yZGVyPTA+PC9hPiA8IS0tYSBocmVmPSIjIiBvbkNsaWNr
234
+ PSJyZWYoJ2h0dHA6Ly8xNDAuMTExLjM0LjQ2L2NnaS1iaW4vZGljdC9uZXdz
235
+ ZWFyY2guY2dpP0RhdGFiYXNlPWRpY3QmUXVlcnlTY29wZT1OYW1lJlF1ZXJ5
236
+ Q29tbWFuZD1maW5kJkdyYXBoaWNXb3JkPXllcyZRdWVyeVN0cmluZz0lRjNw
237
+ JykiLS0+CgkJPGEgaHJlZj0iIyIgb25DbGljaz0icmVmKCdodHRwOi8vMTQw
238
+ LjExMS4zNC40Ni9jZ2ktYmluL25ld0RpY3QvZGljdC5zaD9jb25kPSVGM3Am
239
+ cGllY2VMZW49NTAmZmxkPTEmY2F0PSZ1a2V5PS02MjQ3MjExODgmc2VyaWFs
240
+ PTMmcmVjTm89MCZvcD0maW1nRm9udD0xJykiPgoJCTxpbWcgc3JjPSIvSW1n
241
+ L2d5Y2QyYS5naWYiIGJvcmRlcj0wPjwvYT48L3RkPgogIDwvdHI+CiAgPHRy
242
+ PiAKICAgIDx0ZCBjbGFzcz10PqRqpK29WDo8L3RkPgoJCTx0ZCBjbGFzcz10
243
+ Mj5GMzcwPC90ZD4KICAgIDx0ZCBjbGFzcz10Pq3cvmW9WDo8L3RkPgoJCTx0
244
+ ZCBjbGFzcz10Mj6q96Tgw/ik3zwvdGQ+CiAgICA8dGQgY2xhc3M9dD7AV6fH
245
+ IC8gwFemuDo8L3RkPgoJCTx0ZCBjbGFzcz10Mj4tIC8gMDwvdGQ+CiAgICA8
246
+ dGQgYWxpZ249Y2VudGVyPjxhIGhyZWY9IiMiIG9uQ2xpY2s9InJlZignaHR0
247
+ cDovL2Vwc2lsb24zLmdlb3JnZXRvd24uZWR1L35wZXRlcnNlZS9jZ2ktYmlu
248
+ L3dvcmRsb29rLmNnaT9zZWFyY2h0eXBlPWJpZzUmd2hlcmU9YW55d2hlcmUm
249
+ d29yZD0lRjNwJykiPjxpbWcgc3JjPSIvSW1nL2NlZGljdDJfbmV3LmdpZiIg
250
+ Ym9yZGVyPTA+PC9hPiA8YSBocmVmPSIjIiBvbkNsaWNrPSJyZWYoJy9jZ2kt
251
+ YmluL2FncmVwLWxpbmRpY3Q/cXVlcnk9JUYzcCZib29sZWFuPW5vJmNhc2U9
252
+ b24mY2F0ZWdvcnk9d2hvbGVyZWNvcmQnKSI+PGltZyBzcmM9Ii9JbWcvbGlu
253
+ ZGljdF9sb2dvLmdpZiIgYm9yZGVyPTA+PC9hPjwvdGQ+CiAgPC90cj4KPC90
254
+ YWJsZT4KPGZvcm0+PHRhYmxlIHdpZHRoPSIxMDAlIiBib3JkZXI9IjEiPgog
255
+ IDx0ciBiZ2NvbG9yPSNmZmYwYzI+IAogICAgPHRoIG5vd3JhcCB3aWR0aD0x
256
+ MDA+rbW4YDxicj48Zm9udCBzaXplPS0yPiitu7Tku3mopb7Hvse3fCk8L2Zv
257
+ bnQ+PC90aD4KICAgIDx0aCBub3dyYXAgd2lkdGg9MzA+uGY8YnI+rbU8L3Ro
258
+ PgogICAgPHRoIG5vd3JhcCB3aWR0aD03MD48Zm9udCBjb2xvcj0iZ3JheSIg
259
+ ZmFjZT0iV2luZ2RpbmdzIj4mYW1wOzwvZm9udD4grtq+2jwvdGg+CiAgICA8
260
+ dGggbm93cmFwIHdpZHRoPTEwMD6mUK21pnI8L3RoPgogICAgPHRoIG5vd3Jh
261
+ cCB3aWR0aD04MD6s28P2rbW4YDwvdGg+CiAgICA8dGggbm93cmFwPrX8qNIo
262
+ PGZvbnQgY29sb3I9bWFyb29uIHNpemU9LTE+uNHEwDwvZm9udD4pIC8gPGZv
263
+ bnQgY29sb3I9Zm9yZXN0Z3JlZW4gc2l6ZT0tMT6zxrX5PC9mb250PjwvdGg+
264
+ CiAgPC90cj4KICA8dHI+CiAgICA8dGQgbm93cmFwIGFsaWduPWNlbnRlcj48
265
+ Zm9udCBjb2xvcj1yZWQgc2l6ZT0rMT5vADwvZm9udD48Zm9udCBjb2xvcj1n
266
+ cmVlbiBzaXplPSsxPnU8L2ZvbnQ+PGZvbnQgY29sb3I9Ymx1ZSBzaXplPSsx
267
+ PjE8L2ZvbnQ+PC90ZD4KICAgIDx0ZCBhbGlnbj1jZW50ZXI+PGEgaHJlZj0i
268
+ c291bmQucGhwP3M9b3UxIiB0YXJnZXQ9c291bmQ+PGltZyBzcmM9ImltZy9z
269
+ b3VuZGVyLmdpZiIgYm9yZGVyPTA+PC9hPjwvdGQ+Cgk8dGQgbm93cmFwPjxm
270
+ b250IHNpemU9LTE+pEik5blxuuI8L2ZvbnQ+PC90ZD4KICAgIDx0ZCBub3dy
271
+ YXA+CjxhIGhyZWY9InNlYXJjaC5waHA/cT0lQzMlRUYiPsPvPC9hPiwgPGEg
272
+ aHJlZj0ic2VhcmNoLnBocD9xPSVGMmoiPvJqPC9hPgk8L3RkPgogICAgPHRk
273
+ PjxzZWxlY3Qgb25DaGFuZ2U9Ik1NX2p1bXBNZW51KCdzZWxmJyx0aGlzLDAp
274
+ Ij4KICAgIDxvcHRpb24gc2VsZWN0ZWQgdmFsdWU9IiMiPi0tv+++3C0tPC9v
275
+ cHRpb24+CiAgICA8b3B0aW9uIHZhbHVlPSJwaG8tcmVsLnBocD9zMT1vACZz
276
+ Mj11Ij6mUMFuplDD/Twvb3B0aW9uPgogICAgPG9wdGlvbiB2YWx1ZT0icGhv
277
+ LXJlbC5waHA/czI9dSZzMz0xIj6mUMP9plC91Twvb3B0aW9uPgogICAgPG9w
278
+ dGlvbiB2YWx1ZT0icGhvLXJlbC5waHA/czE9bwAmczM9MSI+plDBbqZQvdU8
279
+ L29wdGlvbj4KICA8L3NlbGVjdD48L3RkPgogICAgPHRkPjxkaXYgbm93cmFw
280
+ PjwvZGl2Pjxmb250IGNvbG9yPWZvcmVzdGdyZWVuIHNpemU9LTE+plChdTxh
281
+ IGhyZWY9InNlYXJjaC5waHA/cT0lQzMlRUYiPsPvPC9hPqF2pnI8L2ZvbnQ+
282
+ PC90ZD4KICA8L3RyPgogIDx0cj4KICAgIDx0ZCBub3dyYXAgYWxpZ249Y2Vu
283
+ dGVyPjxmb250IGNvbG9yPXJlZCBzaXplPSsxPmw8L2ZvbnQ+PGZvbnQgY29s
284
+ b3I9Z3JlZW4gc2l6ZT0rMT51azwvZm9udD48Zm9udCBjb2xvcj1ibHVlIHNp
285
+ emU9KzE+NjwvZm9udD48L3RkPgogICAgPHRkIGFsaWduPWNlbnRlcj48YSBo
286
+ cmVmPSJzb3VuZC5waHA/cz1sdWs2IiB0YXJnZXQ9c291bmQ+PGltZyBzcmM9
287
+ ImltZy9zb3VuZGVyLmdpZiIgYm9yZGVyPTA+PC9hPjwvdGQ+Cgk8dGQgbm93
288
+ cmFwPjxmb250IHNpemU9LTE+pEik5blxuuI8L2ZvbnQ+PC90ZD4KICAgIDx0
289
+ ZCBub3dyYXA+CjxhIGhyZWY9InNlYXJjaC5waHA/cT0lRUUlNUMiPu5cPC9h
290
+ PiwgPGEgaHJlZj0ic2VhcmNoLnBocD9xPSVERSVENyI+3tc8L2E+LCA8YSBo
291
+ cmVmPSJzZWFyY2gucGhwP3E9JURFJUY3Ij7e9zwvYT4gPGEgaHJlZj0icGhv
292
+ LXJlbC5waHA/czE9bCZzMj11ayZzMz02Ij48Zm9udCBzaXplPS0xPls0Ni4u
293
+ XTwvZm9udD48L2E+CTwvdGQ+CiAgICA8dGQ+PHNlbGVjdCBvbkNoYW5nZT0i
294
+ TU1fanVtcE1lbnUoJ3NlbGYnLHRoaXMsMCkiPgogICAgPG9wdGlvbiBzZWxl
295
+ Y3RlZCB2YWx1ZT0iIyI+LS2/777cLS08L29wdGlvbj4KICAgIDxvcHRpb24g
296
+ dmFsdWU9InBoby1yZWwucGhwP3MxPWwmczI9dWsiPqZQwW6mUMP9PC9vcHRp
297
+ b24+CiAgICA8b3B0aW9uIHZhbHVlPSJwaG8tcmVsLnBocD9zMj11ayZzMz02
298
+ Ij6mUMP9plC91Twvb3B0aW9uPgogICAgPG9wdGlvbiB2YWx1ZT0icGhvLXJl
299
+ bC5waHA/czE9bCZzMz02Ij6mUMFuplC91Twvb3B0aW9uPgogIDwvc2VsZWN0
300
+ PjwvdGQ+CiAgICA8dGQ+PGRpdiBub3dyYXA+uWTzcDwvZGl2PjwvdGQ+CiAg
301
+ PC90cj4KPC90YWJsZT48dGFibGUgd2lkdGg9MTAwJSBib3JkZXI9MCBjZWxs
302
+ c3BhY2luZz0wIGNlbGxwYWRkaW5nPTA+PHRyPjx0ZD48Zm9udCBzaXplPS0x
303
+ IGNvbG9yPWdyYXk+t2qvwaa4vMY6IDQ2NDA8L2ZvbnQ+PC90ZD48dGQgYWxp
304
+ Z249cmlnaHQ+PGZvbnQgc2l6ZT0tMT4oPGEgaHJlZj0iYWRtaW4vZWRpdC5w
305
+ aHA/bmV3PUVkaXQmcT0lRjNwIj663rJ6pEit+7FNpc6wzzwvYT4pPC9mb250
306
+ PjwvdGQ+PC90cj48L3RhYmxlPrB0t2bCSTo8YnI+PC9mb3JtPjxocj48Zm9u
307
+ dCBjb2xvcj1ncmF5PlVuaWNvZGU6IDwvZm9udD48YSBocmVmPSJodHRwOi8v
308
+ d3d3LnVuaWNvZGUub3JnL2NnaS1iaW4vR2V0VW5paGFuRGF0YS5wbD9jb2Rl
309
+ cG9pbnQ9OTNENSIgdGFyZ2V0PV9ibGFuaz48aW1nIHNyYz0iL0ltZy91bmlj
310
+ b2RlMi5naWYiIGJvcmRlcj0wIGFsaWduPWFic21pZGRsZT48L2E+IDxmb250
311
+ IHNpemU9LTEgY29sb3I9Z3JheT5VKzkzRDU8L2ZvbnQ+PHRhYmxlIGJvcmRl
312
+ cj0wIGNlbGxzcGFjaW5nPTUgY2VsbHBhZGRpbmc9NT48dHI+PHRkIGNsYXNz
313
+ PXQ+un67eaRqpnKo5To8L3RkPjx0ZCB3aWR0aD0xMDA+PGZvbnQgc2l6ZT0t
314
+ MT5QZy40MjUwPC9mb250PjwvdGQ+PHRkIGNsYXNzPXQ+tLazcbjcOjwvdGQ+
315
+ PHRkPjxmb250IHNpemU9LTE+bHU0IDwvZm9udD48L3RkPjwvdHI+PHRyPjx0
316
+ ZCBjbGFzcz10PrFkurOmcqjlOjwvdGQ+PHRkIHdpZHRoPTEwMD48Zm9udCBz
317
+ aXplPS0xPlBnLjEyNDcuMjkwPC9mb250PjwvdGQ+PHRkIGNsYXNzPXQ+rV7E
318
+ tjo8L3RkPjx0ZD48Zm9udCBzaXplPS0xPjwvZm9udD48L3RkPjwvdHI+PHRy
319
+ Pjx0ZCBjbGFzcz10Pk1hdHRoZXdzOjwvdGQ+PHRkIHdpZHRoPTEwMD48Zm9u
320
+ dCBzaXplPS0xPi08L2ZvbnQ+PC90ZD48dGQgYWxpZ249cmlnaHQ+PGZvbnQg
321
+ c2l6ZT0tMT48L2ZvbnQ+PC90ZD48dGQ+PC90ZD48L3RyPjwvdGFibGU+PC9i
322
+ b2R5PjwvaHRtbD4=
323
+ http_version:
324
+ recorded_at: Mon, 31 Mar 2014 10:33:12 GMT
325
+ recorded_with: VCR 2.8.0
@@ -64,5 +64,12 @@ describe Cantonese::Scraper::WordScraper do
64
64
  expect(word[:syllable][1][:examples]).to be_include("可歌可泣")
65
65
  end
66
66
 
67
+ it "should parse 鏕 properly" do
68
+ word = subject.crawl("鏕")
69
+ expect(word[:syllable]).to be_a(Array)
70
+ expect(word[:syllable][0][:full]).to eq("ou1")
71
+ expect(word[:syllable][1][:full]).to eq("luk6")
72
+ end
73
+
67
74
  end
68
75
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cantonese
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francis Chong
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: tidy_ffi
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: bundler
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -128,6 +142,7 @@ files:
128
142
  - lib/cantonese/version.rb
129
143
  - spec/fixtures/cassettes/Cantonese_Scraper_ClassifiedScraper/_crawl/should_fetch_list_of_classified_words.yml
130
144
  - spec/fixtures/cassettes/Cantonese_Scraper_RadicalScraper/_crawl/should_list_of_radicals.yml
145
+ - spec/fixtures/cassettes/Cantonese_Scraper_WordScraper/_crawl/should_parse_properly.yml
131
146
  - spec/fixtures/cassettes/Cantonese_Scraper_WordScraper/_crawl/should_return_detail_of_a_word.yml
132
147
  - spec/fixtures/cassettes/Cantonese_Scraper_WordScraper/_crawl/should_return_detail_of_a_word_with_multiple_sounds.yml
133
148
  - spec/scraper/classified_scraper_spec.rb
@@ -161,6 +176,7 @@ summary: Set of scraper and processor to fetch Cantonese data.
161
176
  test_files:
162
177
  - spec/fixtures/cassettes/Cantonese_Scraper_ClassifiedScraper/_crawl/should_fetch_list_of_classified_words.yml
163
178
  - spec/fixtures/cassettes/Cantonese_Scraper_RadicalScraper/_crawl/should_list_of_radicals.yml
179
+ - spec/fixtures/cassettes/Cantonese_Scraper_WordScraper/_crawl/should_parse_properly.yml
164
180
  - spec/fixtures/cassettes/Cantonese_Scraper_WordScraper/_crawl/should_return_detail_of_a_word.yml
165
181
  - spec/fixtures/cassettes/Cantonese_Scraper_WordScraper/_crawl/should_return_detail_of_a_word_with_multiple_sounds.yml
166
182
  - spec/scraper/classified_scraper_spec.rb