cantonese 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/cantonese/scraper/word_scraper.rb +6 -6
- data/lib/cantonese/version.rb +1 -1
- data/spec/scraper/word_scraper_spec.rb +0 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 595eae84bbb6d22f686ff35e47026701317a997d
|
4
|
+
data.tar.gz: a84fa35b1f49dd7d2335ef4b049ad0ae185ae2a6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f8d821bc1180244198ea430d1f88ae98b976709366337f12a53cd1ff03c0df93c9dfe40ec76ce12ee6d946f750e1c5680650c61cb39aa5aa54e9a27293233f5f
|
7
|
+
data.tar.gz: bfbd3904e6bd91cf378f02d684c2f7229a3aead90fa423f77567b6725cda75dba6444838c57d4f09ef304fef5540b6f4d53b9972d7c85ac269d2751891919f1a
|
@@ -13,10 +13,10 @@ module Cantonese
|
|
13
13
|
|
14
14
|
# fetch and get the page in UTF8
|
15
15
|
html = open(url).read
|
16
|
-
html = html.encode('UTF-8', 'Big5', :invalid => :replace, :undef => :replace, :replace => '
|
17
|
-
html = TidyFFI::Tidy.clean(html.gsub(/\0/, ''))
|
18
|
-
|
19
|
-
doc = Nokogiri::HTML(html, nil, 'UTF-8')
|
16
|
+
html = html.encode('UTF-8', 'Big5', :invalid => :replace, :undef => :replace, :replace => '')
|
17
|
+
html = TidyFFI::Tidy.clean(html.gsub(/\0/, ''), :input_encoding => "utf8", :output_encoding => "utf8", :wrap => 0)
|
18
|
+
|
19
|
+
doc = Nokogiri::HTML(html, nil, 'UTF-8')
|
20
20
|
word = doc.search(".w").first.text
|
21
21
|
|
22
22
|
radical_id = doc.search("//*[@class = 't' and .='部首:']/following-sibling::td[1]").text.strip.tr('[] ', '').to_i rescue nil
|
@@ -25,7 +25,7 @@ module Cantonese
|
|
25
25
|
big5 = doc.search("//*[@class = 't' and .='大五碼:']/following-sibling::td[1]").text rescue nil
|
26
26
|
chanjie = doc.search("//*[@class = 't' and .='倉頡碼:']/following-sibling::td[1]").text rescue nil
|
27
27
|
rank_and_frequency = doc.search("//*[@class = 't' and .='頻序 / 頻次:']/following-sibling::td[1]").text rescue nil
|
28
|
-
combination = doc.search("
|
28
|
+
combination = doc.search("//*[text()[contains(., '配搭點:')]]").search("a").select{|a| a["href"] =~ /^search/}.collect {|a| a.text.strip }
|
29
29
|
rank, frequency = rank_and_frequency.split("/").collect{|word| word.strip.to_i }
|
30
30
|
|
31
31
|
syllable = doc.search('//form/table[1]/tr[position()>1]').collect do |row|
|
@@ -42,7 +42,7 @@ module Cantonese
|
|
42
42
|
example_text = nil
|
43
43
|
note_text = note.text
|
44
44
|
else
|
45
|
-
example_text = example_or_note.text.gsub(%r{\[[0-9]+\.\.\]}, ', ').split(", ")
|
45
|
+
example_text = example_or_note.text.gsub(%r{\[[0-9]+\.\.\]}, ', ').split(", ").collect{|e| e.strip }
|
46
46
|
note_text = nil
|
47
47
|
end
|
48
48
|
|
data/lib/cantonese/version.rb
CHANGED