taiwanese_news_parser 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +23 -0
- data/Rakefile +4 -0
- data/g0v.json +37 -0
- data/lib/taiwanese_news_parser/parser/apple_daily.rb +69 -0
- data/lib/taiwanese_news_parser/parser/china_times.rb +76 -0
- data/lib/taiwanese_news_parser/parser/cna.rb +59 -0
- data/lib/taiwanese_news_parser/parser/cts.rb +52 -0
- data/lib/taiwanese_news_parser/parser/ettoday.rb +53 -0
- data/lib/taiwanese_news_parser/parser/liberty_times.rb +66 -0
- data/lib/taiwanese_news_parser/parser/liberty_times_big5.rb +51 -0
- data/lib/taiwanese_news_parser/parser/now_news.rb +53 -0
- data/lib/taiwanese_news_parser/parser/tvbs.rb +46 -0
- data/lib/taiwanese_news_parser/parser/udn.rb +43 -0
- data/lib/taiwanese_news_parser/parser.rb +57 -0
- data/lib/taiwanese_news_parser/url_cleaner.rb +19 -0
- data/lib/taiwanese_news_parser/version.rb +3 -0
- data/lib/taiwanese_news_parser.rb +15 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/taiwanese_news_parser/parser/apple_daily_s1.html +484 -0
- data/spec/taiwanese_news_parser/parser/apple_daily_s2.html +333 -0
- data/spec/taiwanese_news_parser/parser/apple_daily_s3.html +334 -0
- data/spec/taiwanese_news_parser/parser/apple_daily_spec.rb +57 -0
- data/spec/taiwanese_news_parser/parser/china_times_s1.html +513 -0
- data/spec/taiwanese_news_parser/parser/china_times_s2.html +538 -0
- data/spec/taiwanese_news_parser/parser/china_times_s3.html +893 -0
- data/spec/taiwanese_news_parser/parser/china_times_s4.html +1045 -0
- data/spec/taiwanese_news_parser/parser/china_times_spec.rb +63 -0
- data/spec/taiwanese_news_parser/parser/cna_s1.html +1616 -0
- data/spec/taiwanese_news_parser/parser/cna_spec.rb +33 -0
- data/spec/taiwanese_news_parser/parser/cts_s1.html +672 -0
- data/spec/taiwanese_news_parser/parser/cts_s2.html +672 -0
- data/spec/taiwanese_news_parser/parser/cts_spec.rb +36 -0
- data/spec/taiwanese_news_parser/parser/ettoday_s1.html +1817 -0
- data/spec/taiwanese_news_parser/parser/ettoday_s2.html +1822 -0
- data/spec/taiwanese_news_parser/parser/ettoday_spec.rb +35 -0
- data/spec/taiwanese_news_parser/parser/liberty_times_big5_s1.html +213 -0
- data/spec/taiwanese_news_parser/parser/liberty_times_big5_spec.rb +31 -0
- data/spec/taiwanese_news_parser/parser/liberty_times_s1.html +145 -0
- data/spec/taiwanese_news_parser/parser/liberty_times_spec.rb +29 -0
- data/spec/taiwanese_news_parser/parser/now_news_s1.html +968 -0
- data/spec/taiwanese_news_parser/parser/now_news_s2.html +986 -0
- data/spec/taiwanese_news_parser/parser/now_news_spec.rb +31 -0
- data/spec/taiwanese_news_parser/parser/tvbs_s1.html +734 -0
- data/spec/taiwanese_news_parser/parser/tvbs_s2.html +739 -0
- data/spec/taiwanese_news_parser/parser/tvbs_spec.rb +36 -0
- data/spec/taiwanese_news_parser/parser/udn_s1.html +1678 -0
- data/spec/taiwanese_news_parser/parser/udn_spec.rb +42 -0
- data/taiwanese_news_parser.gemspec +30 -0
- metadata +237 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 067e4fb14a386429fc75be6ad0ffd3169677d9a7
|
4
|
+
data.tar.gz: b3c9af0188c2df29573c725c09919530a1a8099c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 5d6fde19818bfdf3a408dd985fec21b07dcf7a117a321a35e788bba3861cc8179f50203dea756cc8e841a1d3b3f7f1ece849f15a7084af987af9d63de54e1bf4
|
7
|
+
data.tar.gz: f32b846ff9bc138b611bb330bea80612f129ce62951f06d36a7fb953fa383f029bdbcea7007ff134eb6f613358cb773c84ca2e39d883e08df3dc445547e6e786
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 lulalala
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# TaiwaneseNewsParser
|
2
|
+
|
3
|
+
台灣各新聞網站新聞解析器
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
gem 'taiwanese_news_parser'
|
8
|
+
|
9
|
+
## Usage
|
10
|
+
|
11
|
+
TaiwaneseNewsParser.parse(url)
|
12
|
+
|
13
|
+
會回傳新聞資訊的 hash
|
14
|
+
|
15
|
+
## Contributing
|
16
|
+
|
17
|
+
想要協助的朋友可以幫忙為其他新聞網站寫解析器。實作細節請參考個別解析器以及[wiki](https://github.com/lulalala/taiwanese_news_parser/wiki)。
|
18
|
+
|
19
|
+
1. Fork it
|
20
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
21
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
22
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
23
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/g0v.json
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
{
|
2
|
+
"author": "lulalala",
|
3
|
+
"contributors": [],
|
4
|
+
"status": "alpha",
|
5
|
+
"name": "taiwanese_news_parser",
|
6
|
+
"name_zh": "台灣新聞網站解析器",
|
7
|
+
"description": "Parse news articles from Taiwan-based news agency websites",
|
8
|
+
"description_zh": "台灣新聞媒體網站的新聞解析庫",
|
9
|
+
"homepage": "https://github.com/lulalala/taiwanese_news_parser",
|
10
|
+
"document": "http://hack.g0v.tw/taiwanese_news_parser",
|
11
|
+
"repository": {
|
12
|
+
"type": "git",
|
13
|
+
"url": "https://github.com/lulalala/taiwanese_news_parser"
|
14
|
+
},
|
15
|
+
"licenses": [
|
16
|
+
{
|
17
|
+
"type": "MIT"
|
18
|
+
}
|
19
|
+
],
|
20
|
+
"keywords": [
|
21
|
+
"ruby"
|
22
|
+
],
|
23
|
+
"audience": [
|
24
|
+
"public"
|
25
|
+
],
|
26
|
+
"products": [
|
27
|
+
"library",
|
28
|
+
"api"
|
29
|
+
],
|
30
|
+
"projects": [
|
31
|
+
"taiwanese_news_parser"
|
32
|
+
],
|
33
|
+
"thumbnail": "",
|
34
|
+
"needs": [
|
35
|
+
"programmer"
|
36
|
+
]
|
37
|
+
}
|
@@ -0,0 +1,69 @@
|
|
1
|
+
# Parser for article pages on Apple Daily (appledaily.com.tw).
#
# Uses `url`, `@article` and `clean_up`, none of which are defined in this
# class (presumably provided by TaiwaneseNewsParser::Parser — confirm).
class TaiwaneseNewsParser::Parser::AppleDaily < TaiwaneseNewsParser::Parser
  # Domain fragment identifying this site.
  def self.domain
    'appledaily.com.tw'
  end

  # Names under which this publisher is known.
  def self.names
    %w{蘋果日報}
  end

  # Downloads and parses the page once; later calls reuse the parsed
  # document. The previous version issued a new request on every call, and
  # #parse calls this several times.
  #
  # NOTE(review): Kernel#open only accepts URLs through open-uri's patch,
  # which Ruby 3.0 removed — consider URI.open once the supported Ruby
  # range is confirmed.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://www.appledaily.com.tw/appledaily/article/headline/20130414/34951658'
  # Fills @article with title, company name, content, reporter name and
  # publication time, runs clean_up, and returns @article.
  def parse
    @article[:title] = doc.at_css('#h1').text
    @article[:company_name] = parse_company_name
    @article[:content] = doc.css('.articulum').css('p,h2').text
    @article[:reporter_name] = parse_reporter_name
    @article[:published_at] = self.class.parse_time(doc.css('.gggs time').text)

    clean_up

    @article
  end

  def parse_company_name
    '蘋果日報'
  end

  # Extracts the reporter name from the article body, or nil when neither
  # byline pattern is present.
  def parse_reporter_name
    text = doc.css('.articulum').css('p,h2').text.strip
    # The "◎記者…" form is tried first, then the "【(記者)name/…" form.
    # The slash class accepts ASCII, full-width and box-drawing slashes.
    match = text.match(%r{◎記者(.+)$}) || text.match(%r{【(?:記者)?(.+?)[/／╱]})
    match && match[1]
  end

  # Drops the last path segment from the stored URL (mutates it in place).
  def clean_url
    @article[:url].gsub!(%r{/([^/]*)$}, '')
  end

  # Returns the run of numeric path segments that follows
  # "/<section>/article/<category>/", without its trailing slash, or nil
  # when the URL does not match (previously this raised NoMethodError).
  def self.parse_url_id(url)
    segments = url[%r{http://www\.appledaily\.com\.tw/\w+/article/\w+/((?:\d+/)+)}, 1]
    # removes trailing slash
    segments && segments[0..-2]
  end

  # Parses a timestamp such as "2013年04月14日" (optionally followed by
  # "HH:MM"). Returns a DateTime, or nil when no known format fits.
  def self.parse_time(raw_time)
    valid_formats = ['%Y年%m月%d日%H:%M', '%Y年%m月%d日']

    valid_formats.each do |format|
      begin
        return DateTime.strptime(raw_time, format)
      rescue ArgumentError, TypeError
        # Format did not fit (or raw_time is not a String); try the next one.
        next
      end
    end

    nil
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# Parser for article pages on chinatimes.com.
#
# Uses `url`, `@article` and `clean_up`, none of which are defined in this
# class (presumably provided by TaiwaneseNewsParser::Parser — confirm).
class TaiwaneseNewsParser::Parser::ChinaTimes < TaiwaneseNewsParser::Parser
  # Domain fragment identifying this site.
  def self.domain
    'chinatimes.com'
  end

  # Names of the outlets published under this domain.
  def self.names
    %w{中國時報 中時電子報 工商時報 旺報 時報週刊 中天 中視 中廣}
  end

  # Downloads and parses the page once; later calls reuse the parsed
  # document instead of issuing a new request each time.
  #
  # NOTE(review): Kernel#open only accepts URLs through open-uri's patch,
  # which Ruby 3.0 removed — consider URI.open once the supported Ruby
  # range is confirmed.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://news.chinatimes.com/mainland/11050505/112013041400325.html'
  #url = 'http://www.chinatimes.com/realtimenews/%E6%AD%BB%E4%BA%A1%E9%9B%B2%E9%9C%84%E9%A3%9B%E8%BB%8A-%E7%BE%8E%E5%A9%A6%E5%A2%9C%E8%90%BD%E8%BA%AB%E4%BA%A1-20130720002354-260408'
  # Fills @article with title, company name, content, reporter name and
  # publication time, runs clean_up, and returns @article.
  def parse
    @article[:title] = doc.at_css('.page_container header h1').text
    @article[:company_name] = parse_company_name
    @article[:content] = doc.css('.page_container article>p').text

    #@article[:web_published_at] = Time.parse(doc.at_css('#story_update').text)

    @article[:reporter_name] = parse_reporter_name

    # published_at is nil when the page carries no recognisable timestamp
    # (previously this raised NoMethodError on nil).
    stamp = doc.css('.reporter time').text.match(/(\d+)年(\d+)月(\d+)日 (\d+):(\d+)/)
    @article[:published_at] = stamp && Time.new(*stamp.captures.map(&:to_i))

    clean_up

    @article
  end

  # Returns the author link text when present; otherwise tries the byline
  # patterns on the bare text of .reporter and falls back to that text.
  def parse_reporter_name
    author_link = doc.at_css('.reporter a[rel=author]')
    return author_link.text if author_link

    text = doc.css('.reporter>text()').text
    # The slash class accepts ASCII, full-width and box-drawing slashes.
    match = text.match(%r{記者(.+?)[/／╱]}) || text.match(%r{【(.+?)[/／╱]})
    match ? match[1] : text
  end

  # Reads the outlet label from the first link under .reporter and maps
  # two section labels onto their outlet names.
  def parse_company_name
    label = doc.at_css('.reporter>a').text
    case label
    when '時週精選' then '時報週刊'
    when '新聞速報' then '中時電子報'
    else label
    end
  end

  def clean_url
    cleaner = TaiwaneseNewsParser::UrlCleaner.new('id')
    @article[:url] = cleaner.clean(@article[:url])
  end

  # Tries three URL shapes in order and returns the first id found:
  # the old news.chinatimes.com "<digits>/<digits>" path, the digit run
  # between the last two hyphens, then everything after the domain.
  def self.parse_url_id(url)
    url[%r{http://news\.chinatimes\.com/\w+/(\d+/\d+)}, 1] ||
      url[%r{[^-]*+[^-]*+-(\d+)-\d+}, 1] ||
      url[%r{chinatimes\.com/(.+)}, 1]
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
# Parser for article pages of the Central News Agency (cna.com.tw).
#
# Uses `url`, `@article` and `clean_up`, none of which are defined in this
# class (presumably provided by TaiwaneseNewsParser::Parser — confirm).
class TaiwaneseNewsParser::Parser::Cna < TaiwaneseNewsParser::Parser
  # Domain fragment identifying this site.
  def self.domain
    'cna.com.tw'
  end

  # Names under which this publisher is known. Returns an Array like the
  # other parsers do (the previous `%{...}` literal produced a String).
  def self.names
    %w{中央社}
  end

  # Downloads and parses the page once; later calls reuse the parsed
  # document instead of issuing a new request each time.
  #
  # NOTE(review): Kernel#open only accepts URLs through open-uri's patch,
  # which Ruby 3.0 removed — consider URI.open once the supported Ruby
  # range is confirmed.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://www.cna.com.tw/News/aSaM/201304120296-1.aspx'
  # Fills @article with title, company name, content, reporter name and
  # publication time, runs clean_up, and returns @article.
  def parse
    @article[:title] = doc.at_css('.news_content h1').text
    @article[:company_name] = '中央社'
    @article[:content] = doc.css('.news_content .box_2').text
    @article[:reporter_name] = parse_reporter_name

    # The body carries a 7-digit date whose first three digits are a year
    # offset by 1911; the time of day is read from the .date element.
    match = doc.css('.news_content .box_2').text.strip.match(/(\d{3})(\d{2})(\d{2})/)
    year = match[1].to_i + 1911
    date_string = "#{year}/#{match[2]}/#{match[3]} #{doc.css('.date').text}"
    @article[:published_at] = Time.parse(date_string)

    clean_up

    @article
  end

  # Extracts the reporter name from the dateline that follows "(中央社",
  # or returns nil when there is no dateline or no "記者" in it.
  def parse_reporter_name
    body = doc.css('.news_content .box_2').text
    # The opening parenthesis is matched literally, in ASCII or full-width
    # form; the previous pattern left it as an unclosed group, which is a
    # regexp syntax error.
    dateline = body[/[(（]中央社(.*?)\d{1,2}日/, 1]
    return nil if dateline.nil?

    cities = %w{台北 新北 台中 台南 高雄 基隆 新竹 嘉義 桃園 苗栗 彰化 南投 雲林 屏東 宜蘭 花蓮 台東 澎湖 金門 連江}
    # Strip one trailing place name, with an optional 縣 or 市 suffix
    # (the previous pattern only accepted the two characters together).
    dateline = dateline.sub(/(?:#{cities.join('|')})[縣市]?$/, '')
    # TODO proper location name removal

    match = dateline.match(%r{記者(.+)})
    match && match[1]
  end

  def reproduced?
    false
  end

  # Returns the digit run before an optional "-N" suffix and the file
  # extension, or nil when the URL has no such segment.
  def self.parse_url_id(url)
    url[%r{/(\d+)(?:\-\d)?\.}, 1]
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# Parser for article pages on CTS (cts.com.tw).
#
# Uses `url`, `@article` and `clean_up`, none of which are defined in this
# class (presumably provided by TaiwaneseNewsParser::Parser — confirm).
class TaiwaneseNewsParser::Parser::Cts < TaiwaneseNewsParser::Parser
  # Domain fragment identifying this site.
  def self.domain
    'cts.com.tw'
  end

  # Names under which this publisher is known.
  def self.names
    ['華視']
  end

  # Truthy when the URL contains "cts.com.tw/".
  def self.applicable?(url)
    url.match(%r{cts\.com\.tw/})
  end

  # Downloads and parses the page once; later calls reuse the parsed
  # document instead of issuing a new request each time.
  #
  # NOTE(review): Kernel#open only accepts URLs through open-uri's patch,
  # which Ruby 3.0 removed — consider URI.open once the supported Ruby
  # range is confirmed.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://news.cts.com.tw/cts/politics/201403/201403191393958.html'
  # Fills @article with title, company name, content, publication time and
  # reporter name, runs clean_up, and returns @article.
  def parse
    @article[:title] = doc.at_css('table h1').text
    @article[:company_name] = parse_company_name
    @article[:content] = doc.css('#ctscontent p').text

    # The info line starts with "YYYY/M/D HH:MM"; seconds are appended
    # before handing it to Time.parse.
    time = doc.at_css('td.style14 span.info').text[%r{^\d{4}/\d{1,2}/\d{1,2} \d{2}:\d{2}}]
    @article[:published_at] = Time.parse("#{time}:00")

    @article[:reporter_name] = parse_reporter_name

    clean_up

    @article
  end

  # Returns the text preceding " 報導" in the info line, after removing the
  # leading timestamp and anything from "地區" onward. Returns nil for
  # "綜合報導" bylines or when no byline is found.
  def parse_reporter_name
    info = doc.at_css('td.style14 span.info').text
    byline = info.gsub(%r{^\d{4}/\d{1,2}/\d{1,2} \d{2}:\d{2}}, '').gsub(%r{地區.+$}, '')
    return nil if byline.include?('綜合報導')

    byline[%r{(.+) 報導}, 1]
  end

  # Reads the outlet name from the alt text of a logo image.
  def parse_company_name
    doc.at_css('table table div[align="right"] a img').attr(:alt)
  end

  # Returns the numeric file name of the article, or nil on no match.
  def self.parse_url_id(url)
    url[%r{/cts/.+/\d+/(\d+)\.html}, 1]
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# Parser for article pages on ETtoday (ettoday.net).
#
# Uses `url`, `@article` and `clean_up`, none of which are defined in this
# class (presumably provided by TaiwaneseNewsParser::Parser — confirm).
class TaiwaneseNewsParser::Parser::Ettoday < TaiwaneseNewsParser::Parser
  # Domain fragment identifying this site.
  def self.domain
    'ettoday.net'
  end

  # Names under which this publisher is known.
  def self.names
    %w{東森}
  end

  # Downloads and parses the page once; later calls reuse the parsed
  # document instead of issuing a new request each time.
  #
  # NOTE(review): Kernel#open only accepts URLs through open-uri's patch,
  # which Ruby 3.0 removed — consider URI.open once the supported Ruby
  # range is confirmed.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://www.ettoday.net/news/20130128/158005.htm'
  # Fills @article with title, company name, content, reporter name and
  # publication time, runs clean_up, and returns @article.
  def parse
    @article[:title] = doc.css('[itemprop=headline]').text
    @article[:company_name] = '東森'
    @article[:content] = doc.css('[itemprop=articleBody]>p').text
    @article[:reporter_name] = parse_reporter_name

    # published_at is nil when the page carries no recognisable timestamp
    # (previously this raised NoMethodError on nil).
    stamp = doc.css('.news-time').text.match(/(\d+)年(\d+)月(\d+)日 (\d+):(\d+)/)
    @article[:published_at] = stamp && Time.new(*stamp.captures.map(&:to_i))

    clean_up

    @article
  end

  # Returns the name between "記者" and the first slash in the article
  # body, or nil when there is no such byline. The slash class accepts
  # ASCII, full-width and box-drawing slashes.
  def parse_reporter_name
    match = doc.css('[itemprop=articleBody]').text.match(%r{記者(.+?)[/／╱]})
    match && match[1]
  end

  def clean_url
    cleaner = TaiwaneseNewsParser::UrlCleaner.new()
    @article[:url] = cleaner.clean(@article[:url])
  end

  # Returns "<digits>/<digits>" from the article path, or nil on no match.
  def self.parse_url_id(url)
    url[%r{http://www\.ettoday\.net/\w+/(\d+/\d+)}, 1]
  end

  def reproduced?
    false
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# Parser for Liberty Times live-news pages
# (libertytimes.com.tw/liveNews/news.php).
#
# Uses `url`, `@article` and `clean_up`, none of which are defined in this
# class (presumably provided by TaiwaneseNewsParser::Parser — confirm).
class TaiwaneseNewsParser::Parser::LibertyTimes < TaiwaneseNewsParser::Parser
  # Domain fragment identifying this site.
  def self.domain
    'libertytimes.com.tw'
  end

  # Names under which this publisher is known. Returns an Array like the
  # other parsers do (the previous `%{...}` literal produced a String).
  def self.names
    %w{自由時報}
  end

  # Truthy only for live-news URLs.
  def self.applicable?(url)
    url.match(%r{libertytimes\.com\.tw/liveNews/news\.php})
  end

  # Downloads and parses the page once; later calls reuse the parsed
  # document instead of issuing a new request each time.
  #
  # NOTE(review): Kernel#open only accepts URLs through open-uri's patch,
  # which Ruby 3.0 removed — consider URI.open once the supported Ruby
  # range is confirmed.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://www.libertytimes.com.tw/2013/new/apr/13/today-sp2.htm'
  # Fills @article with title, company name, content, publication time and
  # reporter name, runs clean_up, and returns @article.
  def parse
    # new layout uses utf-8
    @article[:title] = doc.at_css('#newsti text()').text
    @article[:company_name] = parse_company_name
    @article[:content] = doc.css('#newsc.news_content').text
    @article[:published_at] = parse_published_at
    @article[:reporter_name] = parse_reporter_name

    clean_up

    @article
  end

  # Returns the reporter name found in the article content, or nil.
  # Must run after @article[:content] has been set.
  def parse_reporter_name
    content = @article[:content]
    # Slash classes accept ASCII, full-width and box-drawing slashes.
    if match = content.match(%r{〔(.*?)[/／╱](.*?)〕})
      match[1][%r{記者(.+)}, 1]
    elsif match = content.match(%r{記者(.+?)[/／╱]})
      match[1]
    elsif match = content.match(%r{[(（]文[/／](.*?)[)）]})
      # Parentheses are matched literally (ASCII or full-width) so the
      # capture is the name; the previous pattern treated them as a group
      # and captured "文/" instead.
      match[1]
    end
  end

  def parse_company_name
    '自由時報'
  end

  def clean_url
    cleaner = TaiwaneseNewsParser::UrlCleaner.new('no')
    @article[:url] = cleaner.clean(@article[:url])
  end

  # Returns the value of the "no" query parameter, or nil on no match.
  def self.parse_url_id(url)
    url[%r{news\.php\?no=(\d+)}, 1]
  end

  private

  # Reads the timestamp from .conttime. A full "YYYY/M/D HH:MM" value is
  # parsed directly; when only "HH:MM" is present it is combined with
  # today's date. Returns nil when neither form is found.
  def parse_published_at
    stamp_text = doc.at_css('.conttime').text
    full = stamp_text[%r{\d{4}/\d{1,2}/\d{1,2} \d{2}:\d{2}}]
    return Time.parse("#{full}:00") if full

    clock = stamp_text.match(%r{(\d{2}):(\d{2})})
    return nil unless clock

    today = Date.today
    Time.new(today.year, today.month, today.day, clock[1].to_i, clock[2].to_i)
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# Parser for Liberty Times article pages whose path has the form
# /<digits>/<word>/<word>/<digits>/<name>.htm.
#
# Uses `doc`, `@article` and `clean_up`, none of which are defined in this
# class (presumably provided by TaiwaneseNewsParser::Parser — confirm).
class TaiwaneseNewsParser::Parser::LibertyTimesBig5 < TaiwaneseNewsParser::Parser
  # Domain fragment identifying this site.
  def self.domain
    'libertytimes.com.tw'
  end

  # Names under which this publisher is known. Returns an Array like the
  # other parsers do (the previous `%{...}` literal produced a String).
  def self.names
    %w{自由時報}
  end

  # Truthy only for URLs with the dated .htm path layout.
  def self.applicable?(url)
    url.match(%r{libertytimes\.com\.tw/\d+/\w+/\w+/\d+/.+\.htm})
  end

  #url = 'http://www.libertytimes.com.tw/2013/new/apr/13/today-sp2.htm'
  # Fills @article with title, company name, content, reporter name and
  # publication time, runs clean_up, and returns @article.
  def parse
    @article[:title] = doc.at_css('#newtitle').text
    @article[:company_name] = parse_company_name
    @article[:content] = doc.css('#newsContent>span:not(#newtitle)>p:not(.picture)').text

    @article[:reporter_name] = parse_reporter_name
    @article[:published_at] = Time.parse(doc.at_css('#date').text)

    clean_up

    @article
  end

  # Returns the reporter name found in the article content, or nil.
  # Must run after @article[:content] has been set.
  def parse_reporter_name
    content = @article[:content]
    # Slash classes accept ASCII, full-width and box-drawing slashes.
    if match = content.match(%r{〔(.*?)[/／╱](.*?)〕})
      match[1][%r{記者(.+)}, 1]
    elsif match = content.match(%r{記者(.+?)[/／╱]})
      match[1]
    elsif match = content.match(%r{[(（]文[/／](.*?)[)）]})
      # Parentheses are matched literally (ASCII or full-width) so the
      # capture is the name; the previous pattern treated them as a group
      # and captured "文/" instead.
      match[1]
    end
  end

  def parse_company_name
    '自由時報'
  end

  def clean_url
    cleaner = TaiwaneseNewsParser::UrlCleaner.new('')
    @article[:url] = cleaner.clean(@article[:url])
  end

  # Returns the path between the host and ".htm", or nil on no match.
  def self.parse_url_id(url)
    url[%r{http://www\.libertytimes\.com\.tw/(.*)\.htm}, 1]
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# Parser for article pages on NowNews (nownews.com).
#
# Uses `url`, `@article` and `clean_up`, none of which are defined in this
# class (presumably provided by TaiwaneseNewsParser::Parser — confirm).
class TaiwaneseNewsParser::Parser::NowNews < TaiwaneseNewsParser::Parser
  # Domain fragment identifying this site.
  def self.domain
    'nownews.com'
  end

  # Names under which this publisher is known; the first one is used as
  # the company name in #parse.
  def self.names
    %w{NowNews 今日新聞}
  end

  # Downloads and parses the page once; later calls reuse the parsed
  # document instead of issuing a new request each time.
  #
  # NOTE(review): Kernel#open only accepts URLs through open-uri's patch,
  # which Ruby 3.0 removed — consider URI.open once the supported Ruby
  # range is confirmed.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://www.nownews.com/n/2014/03/21/1159861'
  # Fills @article with title, company name, content, reporter name and
  # publication time, runs clean_up, and returns @article.
  def parse
    @article[:title] = doc.css('[itemprop=headline]').text
    @article[:company_name] = self.class.names.first
    @article[:content] = doc.css('[itemprop=articleBody]>p').text
    @article[:reporter_name] = parse_reporter_name

    # published_at is nil when the page carries no recognisable timestamp
    # (previously this raised NoMethodError on nil).
    stamp = doc.css('#reporter_info p').text.match(/(\d+)年\s*(\d+)月\s*(\d+)日\D*(\d+):(\d+)/)
    @article[:published_at] = stamp && Time.new(*stamp.captures.map(&:to_i))

    clean_up

    @article
  end

  # Returns the name between "記者" and the first slash in the article
  # body, or nil when there is no such byline. The slash class accepts
  # ASCII, full-width and box-drawing slashes.
  def parse_reporter_name
    match = doc.css('[itemprop=articleBody]').text.match(%r{記者(.+?)[/／╱]})
    match && match[1]
  end

  def clean_url
    cleaner = TaiwaneseNewsParser::UrlCleaner.new()
    @article[:url] = cleaner.clean(@article[:url])
  end

  # Returns the fourth numeric path segment (the one after the
  # year/month/day segments), or nil on no match.
  def self.parse_url_id(url)
    url[%r{/\d+/\d+/\d+/(\d+)}, 1]
  end

  def reproduced?
    false
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# Parser for TVBS news entries (tvbs.com.tw/entry/...).
#
# Uses `url`, `@article` and `clean_up`, none of which are defined in this
# class (presumably provided by TaiwaneseNewsParser::Parser — confirm).
class TaiwaneseNewsParser::Parser::Tvbs < TaiwaneseNewsParser::Parser
  # Domain fragment identifying this site.
  def self.domain
    'tvbs.com.tw'
  end

  # Names under which this publisher is known; the first one is used as
  # the company name.
  def self.names
    ['TVBS']
  end

  # Truthy only for "/entry" URLs on this domain.
  def self.applicable?(url)
    url.match(%r{tvbs\.com\.tw/entry})
  end

  # Downloads and parses the page once; later calls reuse the parsed
  # document instead of issuing a new request each time.
  #
  # NOTE(review): Kernel#open only accepts URLs through open-uri's patch,
  # which Ruby 3.0 removed — consider URI.open once the supported Ruby
  # range is confirmed.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://news.tvbs.com.tw/entry/519673'
  # Fills @article with title, company name, content, publication time and
  # reporter name, runs clean_up, and returns @article.
  def parse
    @article[:title] = doc.at_css('article h1').text
    @article[:company_name] = parse_company_name
    @article[:content] = doc.css('article .content').text

    # The extracted slice still includes the "時間:" label; seconds are
    # appended before handing it to Time.parse.
    time = doc.at_css('article .meta-data .dateline').text[%r{時間:\d{4}/\d{1,2}/\d{1,2} \d{2}:\d{2}}]
    @article[:published_at] = Time.parse("#{time}:00")

    @article[:reporter_name] = parse_reporter_name

    clean_up

    @article
  end

  # Returns the text after "記者:" in the reporter element, or nil when
  # that label is absent.
  def parse_reporter_name
    doc.at_css('article .meta-data .reporter').text[%r{記者:(.+)}, 1]
  end

  def parse_company_name
    self.class.names.first
  end

  # Returns the numeric entry id, or nil on no match.
  def self.parse_url_id(url)
    url[%r{/entry/(\d+)}, 1]
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# Parser for article pages on udn.com.
#
# Uses `doc`, `@article` and `clean_up`, none of which are defined in this
# class (presumably provided by TaiwaneseNewsParser::Parser — confirm).
class TaiwaneseNewsParser::Parser::Udn < TaiwaneseNewsParser::Parser
  # Domain fragment identifying this site.
  def self.domain
    'udn.com'
  end

  # Names of the outlets published under this domain. Returns an Array of
  # two names (the previous `%{...}` literal produced one String holding
  # both, separated by a space).
  def self.names
    %w{聯合報 聯合晚報}
  end

  #url = 'http://udn.com/NEWS/NATIONAL/NATS5/7807573.shtml'
  # Fills @article with title, content, company name, reporter name and
  # publication time, runs clean_up, and returns @article.
  def parse
    @article[:title] = doc.at_css('#story_title').text
    @article[:content] = doc.at_css('#story').text

    #a.web_published_at = Time.parse(doc.at_css('#story_update').text)

    @article[:company_name] = parse_company_name
    @article[:reporter_name] = parse_reporter_name

    @article[:published_at] = Time.parse(doc.at_css('#story_update').text)

    clean_up

    @article
  end

  # Returns the part of the byline before the first slash, or nil when the
  # byline is missing or has no slash (previously NoMethodError on nil).
  # The slash class accepts ASCII, full-width and box-drawing slashes.
  def parse_company_name
    byline = get_company_name_and_reporter_name
    byline && byline[%r{^(.*?)[/／╱]}, 1]
  end

  # Returns the text between "<slash>記者" and the last slash of the
  # byline, or nil when the byline is missing or has no such part.
  def parse_reporter_name
    byline = get_company_name_and_reporter_name
    byline && byline[%r{[/／╱]記者(.*)[/／╱]}, 1]
  end

  # Returns the first digit run that follows two word-only path segments,
  # or nil on no match.
  def self.parse_url_id(url)
    url[%r{\w+/\w+/(\d+)}, 1]
  end

  private

  # Returns the text enclosed in 【 】 inside #story_author, or nil when
  # the brackets are absent.
  def get_company_name_and_reporter_name
    doc.at_css('#story_author').text[%r{【(.*)】}, 1]
  end
end
|