taiwanese_news_parser 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +23 -0
- data/Rakefile +4 -0
- data/g0v.json +37 -0
- data/lib/taiwanese_news_parser/parser/apple_daily.rb +69 -0
- data/lib/taiwanese_news_parser/parser/china_times.rb +76 -0
- data/lib/taiwanese_news_parser/parser/cna.rb +59 -0
- data/lib/taiwanese_news_parser/parser/cts.rb +52 -0
- data/lib/taiwanese_news_parser/parser/ettoday.rb +53 -0
- data/lib/taiwanese_news_parser/parser/liberty_times.rb +66 -0
- data/lib/taiwanese_news_parser/parser/liberty_times_big5.rb +51 -0
- data/lib/taiwanese_news_parser/parser/now_news.rb +53 -0
- data/lib/taiwanese_news_parser/parser/tvbs.rb +46 -0
- data/lib/taiwanese_news_parser/parser/udn.rb +43 -0
- data/lib/taiwanese_news_parser/parser.rb +57 -0
- data/lib/taiwanese_news_parser/url_cleaner.rb +19 -0
- data/lib/taiwanese_news_parser/version.rb +3 -0
- data/lib/taiwanese_news_parser.rb +15 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/taiwanese_news_parser/parser/apple_daily_s1.html +484 -0
- data/spec/taiwanese_news_parser/parser/apple_daily_s2.html +333 -0
- data/spec/taiwanese_news_parser/parser/apple_daily_s3.html +334 -0
- data/spec/taiwanese_news_parser/parser/apple_daily_spec.rb +57 -0
- data/spec/taiwanese_news_parser/parser/china_times_s1.html +513 -0
- data/spec/taiwanese_news_parser/parser/china_times_s2.html +538 -0
- data/spec/taiwanese_news_parser/parser/china_times_s3.html +893 -0
- data/spec/taiwanese_news_parser/parser/china_times_s4.html +1045 -0
- data/spec/taiwanese_news_parser/parser/china_times_spec.rb +63 -0
- data/spec/taiwanese_news_parser/parser/cna_s1.html +1616 -0
- data/spec/taiwanese_news_parser/parser/cna_spec.rb +33 -0
- data/spec/taiwanese_news_parser/parser/cts_s1.html +672 -0
- data/spec/taiwanese_news_parser/parser/cts_s2.html +672 -0
- data/spec/taiwanese_news_parser/parser/cts_spec.rb +36 -0
- data/spec/taiwanese_news_parser/parser/ettoday_s1.html +1817 -0
- data/spec/taiwanese_news_parser/parser/ettoday_s2.html +1822 -0
- data/spec/taiwanese_news_parser/parser/ettoday_spec.rb +35 -0
- data/spec/taiwanese_news_parser/parser/liberty_times_big5_s1.html +213 -0
- data/spec/taiwanese_news_parser/parser/liberty_times_big5_spec.rb +31 -0
- data/spec/taiwanese_news_parser/parser/liberty_times_s1.html +145 -0
- data/spec/taiwanese_news_parser/parser/liberty_times_spec.rb +29 -0
- data/spec/taiwanese_news_parser/parser/now_news_s1.html +968 -0
- data/spec/taiwanese_news_parser/parser/now_news_s2.html +986 -0
- data/spec/taiwanese_news_parser/parser/now_news_spec.rb +31 -0
- data/spec/taiwanese_news_parser/parser/tvbs_s1.html +734 -0
- data/spec/taiwanese_news_parser/parser/tvbs_s2.html +739 -0
- data/spec/taiwanese_news_parser/parser/tvbs_spec.rb +36 -0
- data/spec/taiwanese_news_parser/parser/udn_s1.html +1678 -0
- data/spec/taiwanese_news_parser/parser/udn_spec.rb +42 -0
- data/taiwanese_news_parser.gemspec +30 -0
- metadata +237 -0
@@ -0,0 +1,63 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for the China Times parser: full-article parsing against saved HTML
# fixtures, and extraction of the article id from old- and new-style URLs.
describe TaiwaneseNewsParser::Parser::ChinaTimes do
  describe '#parse' do
    # Registers the saved HTML fixture as the FakeWeb response for +page_url+,
    # then runs the parser on that URL and returns the parsed article.
    def parse_fixture(page_url, fixture_name)
      FakeWeb.register_uri(:get, page_url, body: sample(__FILE__, fixture_name))
      described_class.new(page_url).parse
    end

    # "realtimenews" URL carrying a percent-encoded slug.
    it do
      parsed = parse_fixture(
        'http://www.chinatimes.com/realtimenews/%E5%8A%A0%E6%B2%B9%E7%AB%99%E7%84%A1%E8%89%AF%E5%93%A1%E5%B7%A5-%E5%AD%B8%E5%A4%A7%E8%88%8C%E9%A0%AD%E8%AD%8F%E7%AC%91%E7%99%8C%E5%A9%A6-20130629002528-260402',
        'china_times_s1.html'
      )

      parsed[:title].should == '「春盈號」第二艘救生筏尋獲時破損 2死15失蹤'
      parsed[:content].should include('高雄籍漁船「春盈號」日前在印度洋海域作業時發生火警')
      parsed[:company_name].should == '中廣'
      parsed[:reporter_name].should == '溫蘭魁'
      parsed[:published_at].should == Time.new(2013, 6, 29, 18, 24)
    end

    # "newspapers" URL carrying a percent-encoded slug.
    it do
      parsed = parse_fixture(
        'http://www.chinatimes.com/newspapers/%E7%99%BD%E7%B1%B3%E6%8F%9B%E7%95%AA%E8%96%AF-%E9%A6%AC%E5%B8%82%E5%BA%9C%E6%9C%89%E5%85%A7%E9%AC%BC%EF%BC%9F-20130718000466-260102',
        'china_times_s2.html'
      )

      parsed[:title].should == '白米換番薯 馬市府有內鬼?'
      parsed[:content].should include('市民只能望著已換成台北富邦銀行的招牌,望樓興歎。')
      parsed[:company_name].should == '中國時報'
      parsed[:reporter_name].should == '張立勳'
      parsed[:published_at].should == Time.new(2013, 7, 18, 5, 40)
    end

    # "newspapers" URL without a slug.
    it do
      parsed = parse_fixture(
        'http://www.chinatimes.com/newspapers/20131127000637-260112',
        'china_times_s3.html'
      )

      parsed[:title].should == '李安失望台灣商業片 不會進戲院'
      parsed[:content].should include('金馬50風光落幕,擔任評審團主委的李安不負眾望,以國際化視野,評出最優秀作品。')
      parsed[:company_name].should == '中國時報'
      parsed[:reporter_name].should == '陳亭均'
      parsed[:published_at].should == Time.new(2013, 11, 27, 4, 9)
    end

    # URL on the news.chinatimes.com host ending in ".html".
    it 'format' do
      parsed = parse_fixture(
        'http://news.chinatimes.com/politics/11050202/112013122200105.html',
        'china_times_s4.html'
      )

      parsed[:title].should == '柯文哲團隊 將網羅一九八五'
      parsed[:content].should include('下一步,更有意號召公民一九八五等社會團體加入。')
      parsed[:company_name].should == '中國時報'
      parsed[:reporter_name].should == '朱真楷'
      parsed[:published_at].should == Time.new(2013, 12, 22, 4, 9)
    end
  end

  describe '.parse_url_id' do
    it 'old url' do
      extracted_id = described_class.parse_url_id(
        'http://news.chinatimes.com/mainland/11050505/112013041400325.html'
      )
      extracted_id.should == '11050505/112013041400325'
    end

    it 'new url' do
      # URL => expected id; checked in insertion order.
      expected_ids = {
        'http://www.chinatimes.com/newspapers/%E9%BB%8E%E5%B7%B4%E5%AB%A9%E7%A6%81%E8%B3%BD-%E4%BA%9E%E9%8C%A6%E8%B3%BD%E5%89%A915%E9%9A%8A-20130720000861-260111' => '20130720000861',
        'http://www.chinatimes.com/realtimenews/%E9%9F%93%E4%BA%9E%E8%88%AA%E7%BD%B9%E9%9B%A3%E5%B0%91%E5%A5%B3-%E7%A2%BA%E5%AE%9A%E9%81%AD%E6%95%91%E8%AD%B7%E8%BB%8A%E8%BC%BE%E6%96%83-20130720002396-260401' => '20130720002396'
      }

      expected_ids.each do |page_url, expected_id|
        described_class.parse_url_id(page_url).should == expected_id
      end
    end

    it do
      page_url = 'http://www.chinatimes.com/realtimenews/%E9%AB%98%E9%90%B510%EF%BC%8F1%E6%BC%B2%E5%83%B9-%E6%B6%88%E5%9F%BA%E6%9C%83%E6%89%B9%E4%B8%8D%E5%90%88%E7%90%86-20130816003789-260405'
      extracted_id = described_class.parse_url_id(page_url)
      extracted_id.should == '20130816003789'
    end

    it do
      page_url = 'http://www.chinatimes.com/realtimenews/%E4%B8%8A%E6%91%A9%E9%90%B5%E7%B4%84%E6%9C%83%E6%9B%9D%E5%85%89-%E9%99%B3%E6%BC%A2%E5%85%B8%E6%88%80%E6%83%85%E7%94%9F%E8%AE%8A-20130816002793-260404'
      extracted_id = described_class.parse_url_id(page_url)
      extracted_id.should == '20130816002793'
    end
  end
end
|