cantonese 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cd26a3b32ad12b765087bfacfc12d15872fb8449
4
- data.tar.gz: 237081e5067765b2687f85814cde13f52274c5bf
3
+ metadata.gz: 595eae84bbb6d22f686ff35e47026701317a997d
4
+ data.tar.gz: a84fa35b1f49dd7d2335ef4b049ad0ae185ae2a6
5
5
  SHA512:
6
- metadata.gz: 04730e07d52a91228cd157eaaf08ccc012fcc2ae453dc37bde26ffa7d5e413ebcef3b1d3c0fdf9491dd14bf47b34101a65a5c464d68ab785708ce5c08a0c7449
7
- data.tar.gz: 7cca9753f3bd2fcbd9e954252c33404c8030dacb48f8fc14753a5d513ea20c13e08b13ec070445663001953302ffda7933039bfde62ba584705dff0e856d880d
6
+ metadata.gz: f8d821bc1180244198ea430d1f88ae98b976709366337f12a53cd1ff03c0df93c9dfe40ec76ce12ee6d946f750e1c5680650c61cb39aa5aa54e9a27293233f5f
7
+ data.tar.gz: bfbd3904e6bd91cf378f02d684c2f7229a3aead90fa423f77567b6725cda75dba6444838c57d4f09ef304fef5540b6f4d53b9972d7c85ac269d2751891919f1a
@@ -13,10 +13,10 @@ module Cantonese
13
13
 
14
14
  # fetch and get the page in UTF8
15
15
  html = open(url).read
16
- html = html.encode('UTF-8', 'Big5', :invalid => :replace, :undef => :replace, :replace => '?')
17
- html = TidyFFI::Tidy.clean(html.gsub(/\0/, ''))
18
-
19
- doc = Nokogiri::HTML(html, nil, 'UTF-8')
16
+ html = html.encode('UTF-8', 'Big5', :invalid => :replace, :undef => :replace, :replace => '')
17
+ html = TidyFFI::Tidy.clean(html.gsub(/\0/, ''), :input_encoding => "utf8", :output_encoding => "utf8", :wrap => 0)
18
+
19
+ doc = Nokogiri::HTML(html, nil, 'UTF-8')
20
20
  word = doc.search(".w").first.text
21
21
 
22
22
  radical_id = doc.search("//*[@class = 't' and .='部首:']/following-sibling::td[1]").text.strip.tr('[] ', '').to_i rescue nil
@@ -25,7 +25,7 @@ module Cantonese
25
25
  big5 = doc.search("//*[@class = 't' and .='大五碼:']/following-sibling::td[1]").text rescue nil
26
26
  chanjie = doc.search("//*[@class = 't' and .='倉頡碼:']/following-sibling::td[1]").text rescue nil
27
27
  rank_and_frequency = doc.search("//*[@class = 't' and .='頻序 / 頻次:']/following-sibling::td[1]").text rescue nil
28
- combination = doc.search("//text()[.='配搭點:']/following-sibling::a").collect{|a| a.text}
28
+ combination = doc.search("//*[text()[contains(., '配搭點:')]]").search("a").select{|a| a["href"] =~ /^search/}.collect {|a| a.text.strip }
29
29
  rank, frequency = rank_and_frequency.split("/").collect{|word| word.strip.to_i }
30
30
 
31
31
  syllable = doc.search('//form/table[1]/tr[position()>1]').collect do |row|
@@ -42,7 +42,7 @@ module Cantonese
42
42
  example_text = nil
43
43
  note_text = note.text
44
44
  else
45
- example_text = example_or_note.text.gsub(%r{\[[0-9]+\.\.\]}, ', ').split(", ")
45
+ example_text = example_or_note.text.gsub(%r{\[[0-9]+\.\.\]}, ', ').split(", ").collect{|e| e.strip }
46
46
  note_text = nil
47
47
  end
48
48
 
@@ -1,3 +1,3 @@
1
1
  module Cantonese
2
- VERSION = "0.2.1"
2
+ VERSION = "0.2.2"
3
3
  end
@@ -34,7 +34,6 @@ describe Cantonese::Scraper::WordScraper do
34
34
  it "should return detail of a word with multiple sounds" do
35
35
  word = subject.crawl("可")
36
36
  expect(word).to be_a(Hash)
37
-
38
37
  expect(word[:text]).to eq("可")
39
38
 
40
39
  expect(word[:stroke]).to eq(5)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cantonese
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francis Chong