cantonese 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cd26a3b32ad12b765087bfacfc12d15872fb8449
4
- data.tar.gz: 237081e5067765b2687f85814cde13f52274c5bf
3
+ metadata.gz: 595eae84bbb6d22f686ff35e47026701317a997d
4
+ data.tar.gz: a84fa35b1f49dd7d2335ef4b049ad0ae185ae2a6
5
5
  SHA512:
6
- metadata.gz: 04730e07d52a91228cd157eaaf08ccc012fcc2ae453dc37bde26ffa7d5e413ebcef3b1d3c0fdf9491dd14bf47b34101a65a5c464d68ab785708ce5c08a0c7449
7
- data.tar.gz: 7cca9753f3bd2fcbd9e954252c33404c8030dacb48f8fc14753a5d513ea20c13e08b13ec070445663001953302ffda7933039bfde62ba584705dff0e856d880d
6
+ metadata.gz: f8d821bc1180244198ea430d1f88ae98b976709366337f12a53cd1ff03c0df93c9dfe40ec76ce12ee6d946f750e1c5680650c61cb39aa5aa54e9a27293233f5f
7
+ data.tar.gz: bfbd3904e6bd91cf378f02d684c2f7229a3aead90fa423f77567b6725cda75dba6444838c57d4f09ef304fef5540b6f4d53b9972d7c85ac269d2751891919f1a
@@ -13,10 +13,10 @@ module Cantonese
13
13
 
14
14
  # fetch and get the page in UTF8
15
15
  html = open(url).read
16
- html = html.encode('UTF-8', 'Big5', :invalid => :replace, :undef => :replace, :replace => '?')
17
- html = TidyFFI::Tidy.clean(html.gsub(/\0/, ''))
18
-
19
- doc = Nokogiri::HTML(html, nil, 'UTF-8')
16
+ html = html.encode('UTF-8', 'Big5', :invalid => :replace, :undef => :replace, :replace => '')
17
+ html = TidyFFI::Tidy.clean(html.gsub(/\0/, ''), :input_encoding => "utf8", :output_encoding => "utf8", :wrap => 0)
18
+
19
+ doc = Nokogiri::HTML(html, nil, 'UTF-8')
20
20
  word = doc.search(".w").first.text
21
21
 
22
22
  radical_id = doc.search("//*[@class = 't' and .='部首:']/following-sibling::td[1]").text.strip.tr('[] ', '').to_i rescue nil
@@ -25,7 +25,7 @@ module Cantonese
25
25
  big5 = doc.search("//*[@class = 't' and .='大五碼:']/following-sibling::td[1]").text rescue nil
26
26
  chanjie = doc.search("//*[@class = 't' and .='倉頡碼:']/following-sibling::td[1]").text rescue nil
27
27
  rank_and_frequency = doc.search("//*[@class = 't' and .='頻序 / 頻次:']/following-sibling::td[1]").text rescue nil
28
- combination = doc.search("//text()[.='配搭點:']/following-sibling::a").collect{|a| a.text}
28
+ combination = doc.search("//*[text()[contains(., '配搭點:')]]").search("a").select{|a| a["href"] =~ /^search/}.collect {|a| a.text.strip }
29
29
  rank, frequency = rank_and_frequency.split("/").collect{|word| word.strip.to_i }
30
30
 
31
31
  syllable = doc.search('//form/table[1]/tr[position()>1]').collect do |row|
@@ -42,7 +42,7 @@ module Cantonese
42
42
  example_text = nil
43
43
  note_text = note.text
44
44
  else
45
- example_text = example_or_note.text.gsub(%r{\[[0-9]+\.\.\]}, ', ').split(", ")
45
+ example_text = example_or_note.text.gsub(%r{\[[0-9]+\.\.\]}, ', ').split(", ").collect{|e| e.strip }
46
46
  note_text = nil
47
47
  end
48
48
 
@@ -1,3 +1,3 @@
1
1
  module Cantonese
2
- VERSION = "0.2.1"
2
+ VERSION = "0.2.2"
3
3
  end
@@ -34,7 +34,6 @@ describe Cantonese::Scraper::WordScraper do
34
34
  it "should return detail of a word with multiple sounds" do
35
35
  word = subject.crawl("可")
36
36
  expect(word).to be_a(Hash)
37
-
38
37
  expect(word[:text]).to eq("可")
39
38
 
40
39
  expect(word[:stroke]).to eq(5)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cantonese
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francis Chong