taiwanese_news_parser 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +23 -0
- data/Rakefile +4 -0
- data/g0v.json +37 -0
- data/lib/taiwanese_news_parser/parser/apple_daily.rb +69 -0
- data/lib/taiwanese_news_parser/parser/china_times.rb +76 -0
- data/lib/taiwanese_news_parser/parser/cna.rb +59 -0
- data/lib/taiwanese_news_parser/parser/cts.rb +52 -0
- data/lib/taiwanese_news_parser/parser/ettoday.rb +53 -0
- data/lib/taiwanese_news_parser/parser/liberty_times.rb +66 -0
- data/lib/taiwanese_news_parser/parser/liberty_times_big5.rb +51 -0
- data/lib/taiwanese_news_parser/parser/now_news.rb +53 -0
- data/lib/taiwanese_news_parser/parser/tvbs.rb +46 -0
- data/lib/taiwanese_news_parser/parser/udn.rb +43 -0
- data/lib/taiwanese_news_parser/parser.rb +57 -0
- data/lib/taiwanese_news_parser/url_cleaner.rb +19 -0
- data/lib/taiwanese_news_parser/version.rb +3 -0
- data/lib/taiwanese_news_parser.rb +15 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/taiwanese_news_parser/parser/apple_daily_s1.html +484 -0
- data/spec/taiwanese_news_parser/parser/apple_daily_s2.html +333 -0
- data/spec/taiwanese_news_parser/parser/apple_daily_s3.html +334 -0
- data/spec/taiwanese_news_parser/parser/apple_daily_spec.rb +57 -0
- data/spec/taiwanese_news_parser/parser/china_times_s1.html +513 -0
- data/spec/taiwanese_news_parser/parser/china_times_s2.html +538 -0
- data/spec/taiwanese_news_parser/parser/china_times_s3.html +893 -0
- data/spec/taiwanese_news_parser/parser/china_times_s4.html +1045 -0
- data/spec/taiwanese_news_parser/parser/china_times_spec.rb +63 -0
- data/spec/taiwanese_news_parser/parser/cna_s1.html +1616 -0
- data/spec/taiwanese_news_parser/parser/cna_spec.rb +33 -0
- data/spec/taiwanese_news_parser/parser/cts_s1.html +672 -0
- data/spec/taiwanese_news_parser/parser/cts_s2.html +672 -0
- data/spec/taiwanese_news_parser/parser/cts_spec.rb +36 -0
- data/spec/taiwanese_news_parser/parser/ettoday_s1.html +1817 -0
- data/spec/taiwanese_news_parser/parser/ettoday_s2.html +1822 -0
- data/spec/taiwanese_news_parser/parser/ettoday_spec.rb +35 -0
- data/spec/taiwanese_news_parser/parser/liberty_times_big5_s1.html +213 -0
- data/spec/taiwanese_news_parser/parser/liberty_times_big5_spec.rb +31 -0
- data/spec/taiwanese_news_parser/parser/liberty_times_s1.html +145 -0
- data/spec/taiwanese_news_parser/parser/liberty_times_spec.rb +29 -0
- data/spec/taiwanese_news_parser/parser/now_news_s1.html +968 -0
- data/spec/taiwanese_news_parser/parser/now_news_s2.html +986 -0
- data/spec/taiwanese_news_parser/parser/now_news_spec.rb +31 -0
- data/spec/taiwanese_news_parser/parser/tvbs_s1.html +734 -0
- data/spec/taiwanese_news_parser/parser/tvbs_s2.html +739 -0
- data/spec/taiwanese_news_parser/parser/tvbs_spec.rb +36 -0
- data/spec/taiwanese_news_parser/parser/udn_s1.html +1678 -0
- data/spec/taiwanese_news_parser/parser/udn_spec.rb +42 -0
- data/taiwanese_news_parser.gemspec +30 -0
- metadata +237 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA1:
|
|
3
|
+
metadata.gz: 067e4fb14a386429fc75be6ad0ffd3169677d9a7
|
|
4
|
+
data.tar.gz: b3c9af0188c2df29573c725c09919530a1a8099c
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 5d6fde19818bfdf3a408dd985fec21b07dcf7a117a321a35e788bba3861cc8179f50203dea756cc8e841a1d3b3f7f1ece849f15a7084af987af9d63de54e1bf4
|
|
7
|
+
data.tar.gz: f32b846ff9bc138b611bb330bea80612f129ce62951f06d36a7fb953fa383f029bdbcea7007ff134eb6f613358cb773c84ca2e39d883e08df3dc445547e6e786
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Copyright (c) 2013 lulalala
|
|
2
|
+
|
|
3
|
+
MIT License
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
6
|
+
a copy of this software and associated documentation files (the
|
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
11
|
+
the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be
|
|
14
|
+
included in all copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# TaiwaneseNewsParser
|
|
2
|
+
|
|
3
|
+
台灣各新聞網站新聞解析器
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
gem 'taiwanese_news_parser'
|
|
8
|
+
|
|
9
|
+
## Usage
|
|
10
|
+
|
|
11
|
+
TaiwaneseNewsParser.parse(url)
|
|
12
|
+
|
|
13
|
+
會回傳新聞資訊的 hash
|
|
14
|
+
|
|
15
|
+
## Contributing
|
|
16
|
+
|
|
17
|
+
想要協助的朋友可以幫忙為其他新聞網站寫解析器。實作細節請參考個別解析器以及[wiki](https://github.com/lulalala/taiwanese_news_parser/wiki)。
|
|
18
|
+
|
|
19
|
+
1. Fork it
|
|
20
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
|
21
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
|
22
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
|
23
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/g0v.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
{
|
|
2
|
+
"author": "lulalala",
|
|
3
|
+
"contributors": [],
|
|
4
|
+
"status": "alpha",
|
|
5
|
+
"name": "taiwanese_news_parser",
|
|
6
|
+
"name_zh": "台灣新聞網站解析器",
|
|
7
|
+
"description": "Parse news articles from Taiwan-based news agency websites",
|
|
8
|
+
"description_zh": "台灣新聞媒體網站的新聞解析庫",
|
|
9
|
+
"homepage": "https://github.com/lulalala/taiwanese_news_parser",
|
|
10
|
+
"document": "http://hack.g0v.tw/taiwanese_news_parser",
|
|
11
|
+
"repository": {
|
|
12
|
+
"type": "git",
|
|
13
|
+
"url": "https://github.com/lulalala/taiwanese_news_parser"
|
|
14
|
+
},
|
|
15
|
+
"licenses": [
|
|
16
|
+
{
|
|
17
|
+
"type": "MIT"
|
|
18
|
+
}
|
|
19
|
+
],
|
|
20
|
+
"keywords": [
|
|
21
|
+
"ruby"
|
|
22
|
+
],
|
|
23
|
+
"audience": [
|
|
24
|
+
"public"
|
|
25
|
+
],
|
|
26
|
+
"products": [
|
|
27
|
+
"library",
|
|
28
|
+
"api"
|
|
29
|
+
],
|
|
30
|
+
"projects": [
|
|
31
|
+
"taiwanese_news_parser"
|
|
32
|
+
],
|
|
33
|
+
"thumbnail": "",
|
|
34
|
+
"needs": [
|
|
35
|
+
"programmer"
|
|
36
|
+
]
|
|
37
|
+
}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# Parser for Apple Daily (蘋果日報) article pages on appledaily.com.tw.
#
# Relies on `url`, `clean_up` and `@article`, which are not defined in this
# class (presumably provided by TaiwaneseNewsParser::Parser -- confirm).
class TaiwaneseNewsParser::Parser::AppleDaily < TaiwaneseNewsParser::Parser
  # @return [String] domain handled by this parser
  def self.domain
    'appledaily.com.tw'
  end

  # @return [Array<String>] names this news outlet is known by
  def self.names
    %w{蘋果日報}
  end

  # Fetches and parses the article page.
  #
  # The result is memoized: #parse and its helpers call #doc several times,
  # and without caching each call would download and re-parse the page again.
  #
  # @return [Nokogiri::HTML::Document]
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://www.appledaily.com.tw/appledaily/article/headline/20130414/34951658'
  # Fills @article with title, company name, content, reporter name and
  # publication time, runs `clean_up`, and returns the article hash.
  #
  # @return [Hash] the populated @article
  def parse
    @article[:title] = doc.at_css('#h1').text

    @article[:company_name] = parse_company_name

    @article[:content] = doc.css('.articulum').css('p,h2').text

    @article[:reporter_name] = parse_reporter_name

    @article[:published_at] = self.class.parse_time(doc.css('.gggs time').text)

    clean_up

    @article
  end

  # @return [String] the fixed company name for this source
  def parse_company_name
    '蘋果日報'
  end

  # Extracts the reporter name from the article body text.
  #
  # Two byline shapes are recognised: text after "◎記者" up to the end of a
  # line, or the text between "【" (optionally followed by "記者") and the
  # first slash-like separator.
  #
  # @return [String, nil] the reporter name, or nil when no byline matches
  def parse_reporter_name
    text = doc.css('.articulum').css('p,h2').text.strip
    if match = text.match(%r{◎記者(.+)$})
      match[1]
    elsif match = text.match(%r{【(?:記者)?(.+?)[//╱]})
      match[1]
    end
  end

  # Drops the last path segment of the article URL.
  #
  # A new string is assigned instead of mutating in place, so the string the
  # caller passed in is left untouched and frozen strings do not raise.
  def clean_url
    @article[:url] = @article[:url].gsub(%r{/([^/]*)$}, '')
  end

  # Extracts the run of numeric path segments that follows
  # "/<section>/article/<category>/" in an Apple Daily URL.
  #
  # Every captured segment must be followed by "/"; the trailing slash is
  # removed from the result.
  #
  # @param url [String]
  # @return [String, nil] e.g. "20130414/34951658", or nil when the URL does
  #   not match (previously this raised NoMethodError on nil)
  def self.parse_url_id(url)
    id_path = url[%r{http://www\.appledaily\.com\.tw/\w+/article/\w+/((?:\d+/)+)}, 1]
    id_path && id_path.chomp('/')
  end

  # Parses a timestamp such as "2013年04月14日06:00" or "2013年04月14日".
  #
  # @param raw_time [String]
  # @return [DateTime, nil] the first successful parse, or nil when no known
  #   format matches
  def self.parse_time(raw_time)
    valid_formats = ['%Y年%m月%d日%H:%M', '%Y年%m月%d日']

    valid_formats.each do |format|
      begin
        return DateTime.strptime(raw_time, format)
      rescue ArgumentError, TypeError
        # This format does not fit (or raw_time is not a String); try the next.
        next
      end
    end

    nil
  end
end
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# Parser for China Times group article pages on chinatimes.com.
#
# Relies on `url`, `clean_up` and `@article`, which are not defined in this
# class (presumably provided by TaiwaneseNewsParser::Parser -- confirm).
class TaiwaneseNewsParser::Parser::ChinaTimes < TaiwaneseNewsParser::Parser
  # @return [String] domain handled by this parser
  def self.domain
    'chinatimes.com'
  end

  # @return [Array<String>] names of the outlets published on this domain
  def self.names
    %w{中國時報 中時電子報 工商時報 旺報 時報週刊 中天 中視 中廣}
  end

  # Fetches and parses the article page.
  #
  # The result is memoized: #parse and its helpers call #doc several times,
  # and without caching each call would download and re-parse the page again.
  #
  # @return [Nokogiri::HTML::Document]
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://news.chinatimes.com/mainland/11050505/112013041400325.html'
  #url = 'http://www.chinatimes.com/realtimenews/%E6%AD%BB%E4%BA%A1%E9%9B%B2%E9%9C%84%E9%A3%9B%E8%BB%8A-%E7%BE%8E%E5%A9%A6%E5%A2%9C%E8%90%BD%E8%BA%AB%E4%BA%A1-20130720002354-260408'
  # Fills @article with title, company name, content, reporter name and
  # publication time, runs `clean_up`, and returns the article hash.
  #
  # Raises NoMethodError when an expected element, or the
  # "<y>年<m>月<d>日 <H>:<M>" timestamp, is missing from the page.
  #
  # @return [Hash] the populated @article
  def parse
    @article[:title] = doc.at_css('.page_container header h1').text

    @article[:company_name] = parse_company_name

    @article[:content] = doc.css('.page_container article>p').text

    #@article[:web_published_at] = Time.parse(doc.at_css('#story_update').text)

    @article[:reporter_name] = parse_reporter_name

    t = doc.css('.reporter time').text.match(/(\d*)年(\d*)月(\d*)日 (\d*):(\d*)/)
    @article[:published_at] = Time.new(t[1], t[2], t[3], t[4], t[5])

    clean_up

    @article
  end

  # Extracts the reporter name from the ".reporter" element.
  #
  # Prefers the author link when present. Otherwise looks in the element's
  # direct text for "記者<name>" or "【<name>" followed by a slash-like
  # separator, and falls back to the whole text.
  #
  # @return [String]
  def parse_reporter_name
    el = doc.at_css('.reporter a[rel=author]')
    return el.text if el

    text = doc.css('.reporter>text()').text
    if match = text.match(%r{記者(.+?)[//╱/]})
      match[1]
    elsif match = text.match(%r{【(.+?)[//╱/]})
      match[1]
    else
      text
    end
  end

  # Reads the outlet name from the first link directly inside ".reporter",
  # mapping two section labels to the outlet they belong to.
  #
  # @return [String]
  def parse_company_name
    n = doc.at_css('.reporter>a').text
    case n
    when '時週精選' then '時報週刊'
    when '新聞速報' then '中時電子報'
    else n
    end
  end

  # Normalises the article URL through UrlCleaner.
  # NOTE(review): 'id' is presumably the query parameter to keep -- confirm
  # against TaiwaneseNewsParser::UrlCleaner.
  def clean_url
    cleaner = TaiwaneseNewsParser::UrlCleaner.new('id')
    @article[:url] = cleaner.clean(@article[:url])
  end

  # Extracts an article identifier from a China Times URL.
  #
  # Three URL shapes are tried in order; the first that matches wins:
  #   1. news.chinatimes.com/<section>/<digits>/<digits>
  #   2. "...-<digits>-<digits>" (the first digit group is returned)
  #   3. everything after "chinatimes.com/"
  #
  # @param url [String]
  # @return [String, nil] nil when none of the shapes match
  def self.parse_url_id(url)
    url[%r{http://news\.chinatimes\.com/\w+/(\d+/\d+)}, 1] ||
      url[%r{[^-]*+[^-]*+-(\d+)-\d+}, 1] ||
      url[%r{chinatimes\.com/(.+)}, 1]
  end
end
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# Parser for Central News Agency (中央社) article pages on cna.com.tw.
#
# Relies on `url`, `clean_up` and `@article`, which are not defined in this
# class (presumably provided by TaiwaneseNewsParser::Parser -- confirm).
class TaiwaneseNewsParser::Parser::Cna < TaiwaneseNewsParser::Parser
  # @return [String] domain handled by this parser
  def self.domain
    'cna.com.tw'
  end

  # @return [Array<String>] names this news outlet is known by
  def self.names
    # %w (array), not % (plain string): sibling parsers return arrays here.
    %w{中央社}
  end

  # Fetches and parses the article page.
  #
  # The result is memoized: #parse and its helpers call #doc several times,
  # and without caching each call would download and re-parse the page again.
  #
  # @return [Nokogiri::HTML::Document]
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://www.cna.com.tw/News/aSaM/201304120296-1.aspx'
  # Fills @article with title, company name, content, reporter name and
  # publication time, runs `clean_up`, and returns the article hash.
  #
  # The date is read from the first 7-digit run in the body, interpreted as
  # a 3-digit year plus month and day; 1911 is added to the year. The text
  # of the ".date" element is appended before parsing.
  # NOTE(review): the 3-digit year looks like a Minguo (ROC) year -- confirm.
  #
  # @return [Hash] the populated @article
  def parse
    @article[:title] = doc.at_css('.news_content h1').text

    @article[:company_name] = '中央社'

    @article[:content] = doc.css('.news_content .box_2').text

    @article[:reporter_name] = parse_reporter_name

    match = doc.css('.news_content .box_2').text.strip.match(/(\d{3})(\d{2})(\d{2})/)
    date = [match[1].to_i + 1911, match[2], match[3]]
    date_string = date.join('/') + ' ' + doc.css('.date').text
    @article[:published_at] = Time.parse(date_string)

    clean_up

    @article
  end

  # Extracts the reporter name from a dateline of the form
  # "(中央社記者<name><city>12日".
  #
  # The opening parenthesis is matched in both its ASCII and full-width
  # form. One trailing city name, optionally followed by 縣 or 市, is
  # stripped before the name is read.
  #
  # @return [String, nil] the reporter name, or nil when there is no
  #   dateline or it contains no "記者"
  def parse_reporter_name
    text = doc.css('.news_content .box_2').text
    text = text[/[((]中央社(.*?)\d{1,2}日/, 1]
    return nil if text.nil?

    cities = %w{台北 新北 台中 台南 高雄 基隆 新竹 嘉義 桃園 苗栗 彰化 南投 雲林 屏東 宜蘭 花蓮 台東 澎湖 金門 連江}
    # gsub! returns nil when nothing was replaced, so `find` stops at the
    # first city that was actually removed.
    cities.find do |city|
      text.gsub!(/#{city}[縣市]?$/, '')
    end
    # TODO proper location name removal

    text[%r{記者(.+)}, 1]
  end

  # Always false for this source.
  # NOTE(review): the meaning of "reproduced" is defined by the caller --
  # confirm against TaiwaneseNewsParser::Parser.
  def reproduced?
    false
  end

  # Extracts the numeric article id that precedes an optional "-<digit>"
  # suffix and a dot, e.g. "201304120296" from ".../201304120296-1.aspx".
  #
  # @param url [String]
  # @return [String, nil] nil when the URL does not match
  def self.parse_url_id(url)
    url[%r{/(\d+)(?:\-\d)?\.}, 1]
  end
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# Parser for CTS (華視) news pages on cts.com.tw.
#
# Relies on `url`, `clean_up` and `@article`, which are not defined in this
# class (presumably provided by TaiwaneseNewsParser::Parser -- confirm).
class TaiwaneseNewsParser::Parser::Cts < TaiwaneseNewsParser::Parser
  # @return [String] domain handled by this parser
  def self.domain
    'cts.com.tw'
  end

  # @return [Array<String>] names this news outlet is known by
  def self.names
    ['華視']
  end

  # @param url [String]
  # @return [MatchData, nil] truthy when the URL is under cts.com.tw/
  def self.applicable?(url)
    url.match(%r{cts\.com\.tw/})
  end

  # Fetches and parses the article page.
  #
  # The result is memoized: #parse and its helpers call #doc several times,
  # and without caching each call would download and re-parse the page again.
  #
  # @return [Nokogiri::HTML::Document]
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://news.cts.com.tw/cts/politics/201403/201403191393958.html'
  # Fills @article with title, company name, content, publication time and
  # reporter name, runs `clean_up`, and returns the article hash.
  #
  # @return [Hash] the populated @article
  def parse
    @article[:title] = doc.at_css('table h1').text
    @article[:company_name] = parse_company_name
    @article[:content] = doc.css('#ctscontent p').text

    # The info line starts with "YYYY/M/D HH:MM"; seconds are appended
    # before handing the string to Time.parse.
    time = doc.at_css('td.style14 span.info').text[%r{^\d{4}/\d{1,2}/\d{1,2} \d{2}:\d{2}}]
    @article[:published_at] = Time.parse("#{time}:00")

    @article[:reporter_name] = parse_reporter_name

    clean_up

    @article
  end

  # Extracts the reporter name from the info line.
  #
  # The leading timestamp and everything from "地區" onward are removed
  # first. Text containing "綜合報導" yields nil. Otherwise the name is
  # whatever precedes " 報導".
  #
  # @return [String, nil]
  def parse_reporter_name
    text = doc.at_css('td.style14 span.info').text
    text = text.gsub(%r{^\d{4}/\d{1,2}/\d{1,2} \d{2}:\d{2}}, '')
    text = text.gsub(%r{地區.+$}, '')
    return nil if text.include?('綜合報導')

    text[%r{(.+) 報導}, 1]
  end

  # Reads the company name from the alt text of the image inside the
  # right-aligned link in the nested page table.
  #
  # @return [String, nil]
  def parse_company_name
    doc.at_css('table table div[align="right"] a img').attr(:alt)
  end

  # Extracts the numeric file name from ".../cts/<section>/<digits>/<id>.html".
  #
  # @param url [String]
  # @return [String, nil] nil when the URL does not match
  def self.parse_url_id(url)
    url[%r{/cts/.+/\d+/(\d+)\.html}, 1]
  end
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Parser for ETtoday (東森) article pages on ettoday.net.
#
# Relies on `url`, `clean_up` and `@article`, which are not defined in this
# class (presumably provided by TaiwaneseNewsParser::Parser -- confirm).
class TaiwaneseNewsParser::Parser::Ettoday < TaiwaneseNewsParser::Parser
  # @return [String] domain handled by this parser
  def self.domain
    'ettoday.net'
  end

  # @return [Array<String>] names this news outlet is known by
  def self.names
    %w{東森}
  end

  # Fetches and parses the article page.
  #
  # The result is memoized: #parse and its helpers call #doc several times,
  # and without caching each call would download and re-parse the page again.
  #
  # @return [Nokogiri::HTML::Document]
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://www.ettoday.net/news/20130128/158005.htm'
  # Fills @article with title, company name, content, reporter name and
  # publication time, runs `clean_up`, and returns the article hash.
  #
  # Raises NoMethodError when the ".news-time" text does not contain a
  # "<y>年<m>月<d>日 <H>:<M>" timestamp.
  #
  # @return [Hash] the populated @article
  def parse
    @article[:title] = doc.css('[itemprop=headline]').text

    @article[:company_name] = '東森'

    @article[:content] = doc.css('[itemprop=articleBody]>p').text

    @article[:reporter_name] = parse_reporter_name

    t = doc.css('.news-time').text.match(/(\d*)年(\d*)月(\d*)日 (\d*):(\d*)/)
    @article[:published_at] = Time.new(t[1], t[2], t[3], t[4], t[5])

    clean_up

    @article
  end

  # Extracts the text between "記者" and the first slash-like separator in
  # the article body.
  #
  # @return [String, nil] nil when the body has no such byline
  def parse_reporter_name
    text = doc.css('[itemprop=articleBody]').text
    match = text.match(%r{記者(.+?)[//╱/]})
    match && match[1]
  end

  # Normalises the article URL through UrlCleaner with its default settings.
  def clean_url
    cleaner = TaiwaneseNewsParser::UrlCleaner.new()
    @article[:url] = cleaner.clean(@article[:url])
  end

  # Extracts "<digits>/<digits>" following the first path segment of an
  # http://www.ettoday.net URL, e.g. "20130128/158005".
  #
  # @param url [String]
  # @return [String, nil] nil when the URL does not match
  def self.parse_url_id(url)
    url[%r{http://www\.ettoday\.net/\w+/(\d+/\d+)}, 1]
  end

  # Always false for this source.
  # NOTE(review): the meaning of "reproduced" is defined by the caller --
  # confirm against TaiwaneseNewsParser::Parser.
  def reproduced?
    false
  end
end
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# Parser for Liberty Times (自由時報) live-news pages
# (libertytimes.com.tw/liveNews/news.php).
#
# Relies on `url`, `clean_up` and `@article`, which are not defined in this
# class (presumably provided by TaiwaneseNewsParser::Parser -- confirm).
class TaiwaneseNewsParser::Parser::LibertyTimes < TaiwaneseNewsParser::Parser
  # @return [String] domain handled by this parser
  def self.domain
    'libertytimes.com.tw'
  end

  # @return [Array<String>] names this news outlet is known by
  def self.names
    # %w (array), not % (plain string): sibling parsers return arrays here.
    %w{自由時報}
  end

  # @param url [String]
  # @return [MatchData, nil] truthy only for liveNews/news.php URLs
  def self.applicable?(url)
    url.match(%r{libertytimes\.com\.tw/liveNews/news\.php})
  end

  # Fetches and parses the article page.
  #
  # The result is memoized: #parse calls #doc several times, and without
  # caching each call would download and re-parse the page again.
  #
  # @return [Nokogiri::HTML::Document]
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://www.libertytimes.com.tw/2013/new/apr/13/today-sp2.htm'
  # Fills @article with title, company name, content, publication time and
  # reporter name, runs `clean_up`, and returns the article hash.
  #
  # When ".conttime" holds only "HH:MM" (no date), today's local date is
  # used for the publication time.
  #
  # @return [Hash] the populated @article
  def parse
    # new layout uses utf-8
    @article[:title] = doc.at_css('#newsti text()').text
    @article[:company_name] = parse_company_name
    @article[:content] = doc.css('#newsc.news_content').text

    time_text = doc.at_css('.conttime').text
    time = time_text[%r{\d{4}/\d{1,2}/\d{1,2} \d{2}:\d{2}}]
    if time.nil?
      match = time_text.match(%r{(\d{2}):(\d{2})})
      today = Date.today
      @article[:published_at] = Time.new(today.year, today.month, today.day, match[1].to_i, match[2].to_i)
    else
      @article[:published_at] = Time.parse("#{time}:00")
    end

    # Must run after :content is set; the byline is read from it.
    @article[:reporter_name] = parse_reporter_name

    clean_up

    @article
  end

  # Extracts the reporter name from @article[:content].
  #
  # Byline shapes are tried in order:
  #   1. "〔…/…〕"    -- the text after "記者" inside the part before the separator
  #   2. "記者<name>/"
  #   3. "(文/<name>)" -- ASCII or full-width parentheses and slash
  #
  # @return [String, nil] nil when no byline is recognised
  def parse_reporter_name
    content = @article[:content]
    if match = content.match(%r{〔(.*?)[//╱](.*?)〕})
      match[1][%r{記者(.+)}, 1]
    elsif match = content.match(%r{記者(.+?)[//╱]})
      match[1]
    elsif match = content.match(%r{[((]文[//](.*?)[))]})
      # The parentheses are literal text here; group 1 is the name only.
      match[1]
    end
  end

  # @return [String] the fixed company name for this source
  def parse_company_name
    '自由時報'
  end

  # Normalises the article URL through UrlCleaner.
  # NOTE(review): 'no' is presumably the query parameter to keep (it is the
  # article-id parameter read by .parse_url_id) -- confirm against UrlCleaner.
  def clean_url
    cleaner = TaiwaneseNewsParser::UrlCleaner.new('no')
    @article[:url] = cleaner.clean(@article[:url])
  end

  # Extracts the numeric value of the "no" query parameter.
  #
  # @param url [String]
  # @return [String, nil] nil when the URL does not match
  def self.parse_url_id(url)
    url[%r{news\.php\?no=(\d+)}, 1]
  end
end
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# Parser for Liberty Times (自由時報) dated ".htm" article pages, e.g.
# libertytimes.com.tw/2013/new/apr/13/today-sp2.htm.
#
# Relies on `doc`, `clean_up` and `@article`, which are not defined in this
# class (presumably provided by TaiwaneseNewsParser::Parser -- confirm).
class TaiwaneseNewsParser::Parser::LibertyTimesBig5 < TaiwaneseNewsParser::Parser
  # @return [String] domain handled by this parser
  def self.domain
    'libertytimes.com.tw'
  end

  # @return [Array<String>] names this news outlet is known by
  def self.names
    # %w (array), not % (plain string): sibling parsers return arrays here.
    %w{自由時報}
  end

  # @param url [String]
  # @return [MatchData, nil] truthy only for
  #   "/<digits>/<word>/<word>/<digits>/<name>.htm" URLs
  def self.applicable?(url)
    url.match(%r{libertytimes\.com\.tw/\d+/\w+/\w+/\d+/.+\.htm})
  end

  #url = 'http://www.libertytimes.com.tw/2013/new/apr/13/today-sp2.htm'
  # Fills @article with title, company name, content, reporter name and
  # publication time, runs `clean_up`, and returns the article hash.
  #
  # @return [Hash] the populated @article
  def parse
    @article[:title] = doc.at_css('#newtitle').text
    @article[:company_name] = parse_company_name
    @article[:content] = doc.css('#newsContent>span:not(#newtitle)>p:not(.picture)').text

    # Must run after :content is set; the byline is read from it.
    @article[:reporter_name] = parse_reporter_name
    @article[:published_at] = Time.parse(doc.at_css('#date').text)

    clean_up

    @article
  end

  # Extracts the reporter name from @article[:content].
  #
  # Byline shapes are tried in order:
  #   1. "〔…/…〕"    -- the text after "記者" inside the part before the separator
  #   2. "記者<name>/"
  #   3. "(文/<name>)" -- ASCII or full-width parentheses and slash
  #
  # @return [String, nil] nil when no byline is recognised
  def parse_reporter_name
    content = @article[:content]
    if match = content.match(%r{〔(.*?)[//╱](.*?)〕})
      match[1][%r{記者(.+)}, 1]
    elsif match = content.match(%r{記者(.+?)[//╱]})
      match[1]
    elsif match = content.match(%r{[((]文[//](.*?)[))]})
      # The parentheses are literal text here; group 1 is the name only.
      match[1]
    end
  end

  # @return [String] the fixed company name for this source
  def parse_company_name
    '自由時報'
  end

  # Normalises the article URL through UrlCleaner.
  # NOTE(review): the meaning of the '' argument is defined by UrlCleaner --
  # confirm there.
  def clean_url
    cleaner = TaiwaneseNewsParser::UrlCleaner.new('')
    @article[:url] = cleaner.clean(@article[:url])
  end

  # Extracts the path between the host and the ".htm" suffix,
  # e.g. "2013/new/apr/13/today-sp2".
  #
  # @param url [String]
  # @return [String, nil] nil when the URL does not match
  def self.parse_url_id(url)
    url[%r{http://www\.libertytimes\.com\.tw/(.*)\.htm}, 1]
  end
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Parser for NowNews (今日新聞) article pages on nownews.com.
#
# Relies on `url`, `clean_up` and `@article`, which are not defined in this
# class (presumably provided by TaiwaneseNewsParser::Parser -- confirm).
class TaiwaneseNewsParser::Parser::NowNews < TaiwaneseNewsParser::Parser
  # @return [String] domain handled by this parser
  def self.domain
    'nownews.com'
  end

  # @return [Array<String>] names this news outlet is known by; the first
  #   one is used as the article's company name
  def self.names
    %w{NowNews 今日新聞}
  end

  # Fetches and parses the article page.
  #
  # The result is memoized: #parse and its helpers call #doc several times,
  # and without caching each call would download and re-parse the page again.
  #
  # @return [Nokogiri::HTML::Document]
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://www.nownews.com/n/2014/03/21/1159861'
  # Fills @article with title, company name, content, reporter name and
  # publication time, runs `clean_up`, and returns the article hash.
  #
  # Raises NoMethodError when "#reporter_info p" holds no
  # "<y>年 <m>月 <d>日 … <H>:<M>" timestamp.
  #
  # @return [Hash] the populated @article
  def parse
    @article[:title] = doc.css('[itemprop=headline]').text

    @article[:company_name] = self.class.names.first

    @article[:content] = doc.css('[itemprop=articleBody]>p').text

    @article[:reporter_name] = parse_reporter_name

    t = doc.css('#reporter_info p').text.match(/(\d*)年\s*(\d+)月\s*(\d+)日\D*(\d+):(\d+)/)
    @article[:published_at] = Time.new(t[1], t[2], t[3], t[4], t[5])

    clean_up

    @article
  end

  # Extracts the text between "記者" and the first slash-like separator in
  # the article body.
  #
  # @return [String, nil] nil when the body has no such byline
  def parse_reporter_name
    text = doc.css('[itemprop=articleBody]').text
    match = text.match(%r{記者(.+?)[//╱/]})
    match && match[1]
  end

  # Normalises the article URL through UrlCleaner with its default settings.
  def clean_url
    cleaner = TaiwaneseNewsParser::UrlCleaner.new()
    @article[:url] = cleaner.clean(@article[:url])
  end

  # Extracts the fourth of four consecutive numeric path segments,
  # e.g. "1159861" from ".../2014/03/21/1159861".
  #
  # @param url [String]
  # @return [String, nil] nil when the URL does not match
  def self.parse_url_id(url)
    url[%r{/\d+/\d+/\d+/(\d+)}, 1]
  end

  # Always false for this source.
  # NOTE(review): the meaning of "reproduced" is defined by the caller --
  # confirm against TaiwaneseNewsParser::Parser.
  def reproduced?
    false
  end
end
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# Parser for TVBS news entries (tvbs.com.tw/entry/...).
#
# Relies on `url`, `clean_up` and `@article`, which are not defined in this
# class (presumably provided by TaiwaneseNewsParser::Parser -- confirm).
class TaiwaneseNewsParser::Parser::Tvbs < TaiwaneseNewsParser::Parser
  # @return [String] domain handled by this parser
  def self.domain
    'tvbs.com.tw'
  end

  # @return [Array<String>] names this news outlet is known by; the first
  #   one is used as the article's company name
  def self.names
    ['TVBS']
  end

  # @param url [String]
  # @return [MatchData, nil] truthy only for tvbs.com.tw/entry URLs
  def self.applicable?(url)
    url.match(%r{tvbs\.com\.tw/entry})
  end

  # Fetches and parses the article page.
  #
  # The result is memoized: #parse and its helpers call #doc several times,
  # and without caching each call would download and re-parse the page again.
  #
  # @return [Nokogiri::HTML::Document]
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://news.tvbs.com.tw/entry/519673'
  # Fills @article with title, company name, content, publication time and
  # reporter name, runs `clean_up`, and returns the article hash.
  #
  # @return [Hash] the populated @article
  def parse
    @article[:title] = doc.at_css('article h1').text
    @article[:company_name] = parse_company_name
    @article[:content] = doc.css('article .content').text

    # The matched text keeps its "時間:" label; seconds are appended before
    # handing the string to Time.parse.
    time = doc.at_css('article .meta-data .dateline').text[%r{時間:\d{4}/\d{1,2}/\d{1,2} \d{2}:\d{2}}]
    @article[:published_at] = Time.parse("#{time}:00")

    @article[:reporter_name] = parse_reporter_name

    clean_up

    @article
  end

  # Extracts the text after "記者:" in the reporter metadata element.
  #
  # @return [String, nil] nil when the label is not present
  def parse_reporter_name
    doc.at_css('article .meta-data .reporter').text[%r{記者:(.+)}, 1]
  end

  # @return [String] the first entry of .names
  def parse_company_name
    self.class.names.first
  end

  # Extracts the numeric id following "/entry/".
  #
  # @param url [String]
  # @return [String, nil] nil when the URL does not match
  def self.parse_url_id(url)
    url[%r{/entry/(\d+)}, 1]
  end
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Parser for United Daily News group article pages on udn.com.
#
# Relies on `doc`, `clean_up` and `@article`, which are not defined in this
# class (presumably provided by TaiwaneseNewsParser::Parser -- confirm).
class TaiwaneseNewsParser::Parser::Udn < TaiwaneseNewsParser::Parser
  # @return [String] domain handled by this parser
  def self.domain
    'udn.com'
  end

  # @return [Array<String>] names of the outlets published on this domain
  def self.names
    # %w (array), not % (plain string): the latter yields the single string
    # "聯合報 聯合晚報" instead of two names.
    %w{聯合報 聯合晚報}
  end

  #url = 'http://udn.com/NEWS/NATIONAL/NATS5/7807573.shtml'
  # Fills @article with title, content, company name, reporter name and
  # publication time, runs `clean_up`, and returns the article hash.
  #
  # @return [Hash] the populated @article
  def parse
    @article[:title] = doc.at_css('#story_title').text
    @article[:content] = doc.at_css('#story').text

    #a.web_published_at = Time.parse(doc.at_css('#story_update').text)

    @article[:company_name] = parse_company_name
    @article[:reporter_name] = parse_reporter_name

    @article[:published_at] = Time.parse(doc.at_css('#story_update').text)

    clean_up

    @article
  end

  # Extracts the outlet name: the byline text before the first slash-like
  # separator.
  #
  # @return [String, nil] nil when the byline is missing or contains no
  #   separator (previously this raised NoMethodError)
  def parse_company_name
    byline = get_company_name_and_reporter_name
    byline && byline[%r{^(.*?)[//╱]}, 1]
  end

  # Extracts the reporter name: the byline text between "/記者" and the
  # last slash-like separator.
  #
  # @return [String, nil] nil when the byline is missing or has no such part
  def parse_reporter_name
    byline = get_company_name_and_reporter_name
    byline && byline[%r{[//╱]記者(.*)[//╱]}, 1]
  end

  # Extracts the third path-like segment when it is numeric, e.g. "7807573"
  # from ".../NATIONAL/NATS5/7807573.shtml".
  #
  # @param url [String]
  # @return [String, nil] nil when the URL does not match
  def self.parse_url_id(url)
    url[%r{\w+/\w+/(\d+)}, 1]
  end

  private

  # Returns the text enclosed in "【…】" inside the "#story_author" element.
  #
  # @return [String, nil] nil when no bracketed byline is present
  def get_company_name_and_reporter_name
    doc.at_css('#story_author').text[%r{【(.*)】}, 1]
  end
end
|