taiwanese_news_parser 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE.txt +22 -0
  5. data/README.md +23 -0
  6. data/Rakefile +4 -0
  7. data/g0v.json +37 -0
  8. data/lib/taiwanese_news_parser/parser/apple_daily.rb +69 -0
  9. data/lib/taiwanese_news_parser/parser/china_times.rb +76 -0
  10. data/lib/taiwanese_news_parser/parser/cna.rb +59 -0
  11. data/lib/taiwanese_news_parser/parser/cts.rb +52 -0
  12. data/lib/taiwanese_news_parser/parser/ettoday.rb +53 -0
  13. data/lib/taiwanese_news_parser/parser/liberty_times.rb +66 -0
  14. data/lib/taiwanese_news_parser/parser/liberty_times_big5.rb +51 -0
  15. data/lib/taiwanese_news_parser/parser/now_news.rb +53 -0
  16. data/lib/taiwanese_news_parser/parser/tvbs.rb +46 -0
  17. data/lib/taiwanese_news_parser/parser/udn.rb +43 -0
  18. data/lib/taiwanese_news_parser/parser.rb +57 -0
  19. data/lib/taiwanese_news_parser/url_cleaner.rb +19 -0
  20. data/lib/taiwanese_news_parser/version.rb +3 -0
  21. data/lib/taiwanese_news_parser.rb +15 -0
  22. data/spec/spec_helper.rb +9 -0
  23. data/spec/taiwanese_news_parser/parser/apple_daily_s1.html +484 -0
  24. data/spec/taiwanese_news_parser/parser/apple_daily_s2.html +333 -0
  25. data/spec/taiwanese_news_parser/parser/apple_daily_s3.html +334 -0
  26. data/spec/taiwanese_news_parser/parser/apple_daily_spec.rb +57 -0
  27. data/spec/taiwanese_news_parser/parser/china_times_s1.html +513 -0
  28. data/spec/taiwanese_news_parser/parser/china_times_s2.html +538 -0
  29. data/spec/taiwanese_news_parser/parser/china_times_s3.html +893 -0
  30. data/spec/taiwanese_news_parser/parser/china_times_s4.html +1045 -0
  31. data/spec/taiwanese_news_parser/parser/china_times_spec.rb +63 -0
  32. data/spec/taiwanese_news_parser/parser/cna_s1.html +1616 -0
  33. data/spec/taiwanese_news_parser/parser/cna_spec.rb +33 -0
  34. data/spec/taiwanese_news_parser/parser/cts_s1.html +672 -0
  35. data/spec/taiwanese_news_parser/parser/cts_s2.html +672 -0
  36. data/spec/taiwanese_news_parser/parser/cts_spec.rb +36 -0
  37. data/spec/taiwanese_news_parser/parser/ettoday_s1.html +1817 -0
  38. data/spec/taiwanese_news_parser/parser/ettoday_s2.html +1822 -0
  39. data/spec/taiwanese_news_parser/parser/ettoday_spec.rb +35 -0
  40. data/spec/taiwanese_news_parser/parser/liberty_times_big5_s1.html +213 -0
  41. data/spec/taiwanese_news_parser/parser/liberty_times_big5_spec.rb +31 -0
  42. data/spec/taiwanese_news_parser/parser/liberty_times_s1.html +145 -0
  43. data/spec/taiwanese_news_parser/parser/liberty_times_spec.rb +29 -0
  44. data/spec/taiwanese_news_parser/parser/now_news_s1.html +968 -0
  45. data/spec/taiwanese_news_parser/parser/now_news_s2.html +986 -0
  46. data/spec/taiwanese_news_parser/parser/now_news_spec.rb +31 -0
  47. data/spec/taiwanese_news_parser/parser/tvbs_s1.html +734 -0
  48. data/spec/taiwanese_news_parser/parser/tvbs_s2.html +739 -0
  49. data/spec/taiwanese_news_parser/parser/tvbs_spec.rb +36 -0
  50. data/spec/taiwanese_news_parser/parser/udn_s1.html +1678 -0
  51. data/spec/taiwanese_news_parser/parser/udn_spec.rb +42 -0
  52. data/taiwanese_news_parser.gemspec +30 -0
  53. metadata +237 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 067e4fb14a386429fc75be6ad0ffd3169677d9a7
4
+ data.tar.gz: b3c9af0188c2df29573c725c09919530a1a8099c
5
+ SHA512:
6
+ metadata.gz: 5d6fde19818bfdf3a408dd985fec21b07dcf7a117a321a35e788bba3861cc8179f50203dea756cc8e841a1d3b3f7f1ece849f15a7084af987af9d63de54e1bf4
7
+ data.tar.gz: f32b846ff9bc138b611bb330bea80612f129ce62951f06d36a7fb953fa383f029bdbcea7007ff134eb6f613358cb773c84ca2e39d883e08df3dc445547e6e786
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in taiwanese_news_parser.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 lulalala
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,23 @@
1
+ # TaiwaneseNewsParser
2
+
3
+ 台灣各新聞網站新聞解析器
4
+
5
+ ## Installation
6
+
7
+ gem 'taiwanese_news_parser'
8
+
9
+ ## Usage
10
+
11
+ TaiwaneseNewsParser.parse(url)
12
+
13
+ 會回傳新聞資訊的 hash
14
+
15
+ ## Contributing
16
+
17
+ 想要協助的朋友可以幫忙為其他新聞網站寫解析器。實作細節請參考個別解析器以及[wiki](https://github.com/lulalala/taiwanese_news_parser/wiki)。
18
+
19
+ 1. Fork it
20
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
21
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
22
+ 4. Push to the branch (`git push origin my-new-feature`)
23
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,4 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new
data/g0v.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "author": "lulalala",
3
+ "contributors": [],
4
+ "status": "alpha",
5
+ "name": "taiwanese_news_parser",
6
+ "name_zh": "台灣新聞網站解析器",
7
+ "description": "Parse news articles from Taiwan-based news agency websites",
8
+ "description_zh": "台灣新聞媒體網站的新聞解析庫",
9
+ "homepage": "https://github.com/lulalala/taiwanese_news_parser",
10
+ "document": "http://hack.g0v.tw/taiwanese_news_parser",
11
+ "repository": {
12
+ "type": "git",
13
+ "url": "https://github.com/lulalala/taiwanese_news_parser"
14
+ },
15
+ "licenses": [
16
+ {
17
+ "type": "MIT"
18
+ }
19
+ ],
20
+ "keywords": [
21
+ "ruby"
22
+ ],
23
+ "audience": [
24
+ "public"
25
+ ],
26
+ "products": [
27
+ "library",
28
+ "api"
29
+ ],
30
+ "projects": [
31
+ "taiwanese_news_parser"
32
+ ],
33
+ "thumbnail": "",
34
+ "needs": [
35
+ "programmer"
36
+ ]
37
+ }
@@ -0,0 +1,69 @@
1
# Parser for Apple Daily (蘋果日報) article pages on appledaily.com.tw.
#
# Fills the @article hash (title, company_name, content, reporter_name,
# published_at) from the downloaded HTML. @article, url and clean_up are
# provided outside this class (presumably by TaiwaneseNewsParser::Parser).
class TaiwaneseNewsParser::Parser::AppleDaily < TaiwaneseNewsParser::Parser
  # Timestamp layouts accepted by parse_time, tried in this order.
  TIME_FORMATS = ['%Y年%m月%d日%H:%M', '%Y年%m月%d日'].freeze

  # Domain fragment identifying this news site.
  def self.domain
    'appledaily.com.tw'
  end

  # Names under which this news company is known.
  def self.names
    %w{蘋果日報}
  end

  # Downloads and parses the article page.
  #
  # The parsed document is cached: parse and its helpers call doc several
  # times, and without the cache each call issued a new HTTP request.
  # NOTE(review): open(url) relies on open-uri being loaded elsewhere.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://www.appledaily.com.tw/appledaily/article/headline/20130414/34951658'
  # Extracts all article fields and returns the @article hash.
  def parse
    @article[:title] = doc.at_css('#h1').text

    @article[:company_name] = parse_company_name

    @article[:content] = doc.css('.articulum').css('p,h2').text

    @article[:reporter_name] = parse_reporter_name

    @article[:published_at] = self.class.parse_time(doc.css('.gggs time').text)

    clean_up

    @article
  end

  # The company name is fixed for this site.
  def parse_company_name
    '蘋果日報'
  end

  # Extracts the reporter name from the article body.
  #
  # Two byline shapes are recognised: "◎記者<name>" up to the end of a line,
  # and "【(記者)<name>／..." where the name ends at a slash. The slash may
  # be ASCII "/", full-width "／" or box-drawing "╱".
  # Returns nil when neither shape is present.
  def parse_reporter_name
    text = doc.css('.articulum').css('p,h2').text.strip
    if (match = text.match(%r{◎記者(.+)$}))
      match[1]
    elsif (match = text.match(%r{【(?:記者)?(.+?)[/／╱]}))
      match[1]
    end
  end

  # Strips the last path segment from the stored article URL (in place).
  def clean_url
    @article[:url].gsub!(%r{/([^/]*)$},'')
  end

  # Returns the slash-separated numeric id portion of an article URL
  # (e.g. "20130414/34951658"), or nil when the URL has no such portion.
  def self.parse_url_id(url)
    id = url[%r{http://www\.appledaily\.com\.tw/\w+/article/\w+/((?:\d+/)+)},1]
    # The capture always ends with "/"; drop it.
    id && id.chomp('/')
  end

  # Parses the site's Chinese-formatted timestamp.
  #
  # Returns a DateTime for the first layout in TIME_FORMATS that matches,
  # or nil when none does.
  def self.parse_time(raw_time)
    TIME_FORMATS.each do |format|
      begin
        return DateTime.strptime(raw_time, format)
      rescue ArgumentError, TypeError
        # Layout did not match (or raw_time is not a string); try the next one.
        next
      end
    end
    nil
  end
end
@@ -0,0 +1,76 @@
1
# Parser for article pages on chinatimes.com (China Times group outlets).
#
# Fills the @article hash from the downloaded HTML. @article, url and
# clean_up are provided outside this class.
class TaiwaneseNewsParser::Parser::ChinaTimes < TaiwaneseNewsParser::Parser
  # Domain fragment identifying this news site.
  def self.domain
    'chinatimes.com'
  end

  # Names of the outlets covered by this parser.
  def self.names
    %w{中國時報 中時電子報 工商時報 旺報 時報週刊 中天 中視 中廣}
  end

  # Downloads and parses the article page.
  #
  # The parsed document is cached so that the many doc calls made during
  # one parse do not each trigger a new HTTP request.
  # NOTE(review): open(url) relies on open-uri being loaded elsewhere.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://news.chinatimes.com/mainland/11050505/112013041400325.html'
  #url = 'http://www.chinatimes.com/realtimenews/%E6%AD%BB%E4%BA%A1%E9%9B%B2%E9%9C%84%E9%A3%9B%E8%BB%8A-%E7%BE%8E%E5%A9%A6%E5%A2%9C%E8%90%BD%E8%BA%AB%E4%BA%A1-20130720002354-260408'
  # Extracts all article fields and returns the @article hash.
  # Raises NoMethodError if the page lacks the expected title or timestamp.
  def parse
    @article[:title] = doc.at_css('.page_container header h1').text

    @article[:company_name] = parse_company_name

    @article[:content] = doc.css('.page_container article>p').text

    #@article[:web_published_at] = Time.parse(doc.at_css('#story_update').text)

    @article[:reporter_name] = parse_reporter_name

    # Timestamp is rendered as "<Y>年<M>月<D>日 <h>:<m>".
    t = doc.css('.reporter time').text.match(/(\d*)年(\d*)月(\d*)日 (\d*):(\d*)/)
    @article[:published_at] = Time.new(t[1],t[2],t[3],t[4],t[5])

    clean_up

    @article
  end

  # Extracts the reporter name.
  #
  # Prefers the author link; otherwise inspects the bare text of .reporter
  # for "記者<name>／" or "【<name>／" and, failing both, returns that text
  # unchanged. The slash may be ASCII "/", full-width "／" or "╱".
  def parse_reporter_name
    el = doc.at_css('.reporter a[rel=author]')
    return el.text if el

    text = doc.css('.reporter>text()').text
    if (match = text.match(%r{記者(.+?)[/／╱]}))
      match[1]
    elsif (match = text.match(%r{【(.+?)[/／╱]}))
      match[1]
    else
      text
    end
  end

  # Reads the outlet name from the first link inside .reporter and maps
  # two section labels to the outlet names listed in self.names.
  def parse_company_name
    n = doc.at_css('.reporter>a').text
    case n
    when '時週精選' then '時報週刊'
    when '新聞速報' then '中時電子報'
    else n
    end
  end

  # Normalises the stored URL through UrlCleaner.
  # NOTE(review): 'id' is presumably the query parameter to keep — confirm
  # against UrlCleaner.
  def clean_url
    cleaner = TaiwaneseNewsParser::UrlCleaner.new('id')
    @article[:url] = cleaner.clean(@article[:url])
  end

  # Derives an article id from the URL, trying in order:
  # 1. the "<digits>/<digits>" path of news.chinatimes.com URLs,
  # 2. the first digit run in a "-<digits>-<digits>" sequence,
  # 3. everything after "chinatimes.com/".
  # Returns nil when none applies.
  def self.parse_url_id(url)
    url[%r{http://news\.chinatimes\.com/\w+/(\d+/\d+)},1] ||
      url[%r{[^-]*+[^-]*+-(\d+)-\d+},1] ||
      url[%r{chinatimes\.com/(.+)},1]
  end
end
@@ -0,0 +1,59 @@
1
# Parser for Central News Agency (中央社) article pages on cna.com.tw.
#
# Fills the @article hash from the downloaded HTML. @article, url and
# clean_up are provided outside this class.
class TaiwaneseNewsParser::Parser::Cna < TaiwaneseNewsParser::Parser
  # Place names that may trail the reporter name inside the dateline.
  CITIES = %w{台北 新北 台中 台南 高雄 基隆 新竹 嘉義 桃園 苗栗 彰化 南投 雲林 屏東 宜蘭 花蓮 台東 澎湖 金門 連江}.freeze

  # Domain fragment identifying this news site.
  def self.domain
    'cna.com.tw'
  end

  # Names under which this news company is known (an Array, like the
  # other parsers return).
  def self.names
    %w{中央社}
  end

  # Downloads and parses the article page.
  #
  # The parsed document is cached so repeated doc calls during one parse
  # do not each trigger a new HTTP request.
  # NOTE(review): open(url) relies on open-uri being loaded elsewhere.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://www.cna.com.tw/News/aSaM/201304120296-1.aspx'
  # Extracts all article fields and returns the @article hash.
  # Raises NoMethodError if the page lacks the expected title or date stamp.
  def parse
    @article[:title] = doc.at_css('.news_content h1').text

    @article[:company_name] = '中央社'

    @article[:content] = doc.css('.news_content .box_2').text

    @article[:reporter_name] = parse_reporter_name

    # The body carries a 7-digit stamp: 3-digit year, 2-digit month, 2-digit day.
    # Adding 1911 converts the year — presumably a Republic of China (Minguo)
    # year — to the Gregorian calendar.
    match = doc.css('.news_content .box_2').text.strip.match( /(\d{3})(\d{2})(\d{2})/ )
    date = [match[1].to_i + 1911, match[2], match[3]]
    date_string = date.join('/') + ' ' + doc.css('.date').text
    @article[:published_at] = Time.parse(date_string)

    clean_up

    @article
  end

  # Extracts the reporter name from the dateline "（中央社<...><day>日".
  #
  # The opening parenthesis may be full-width or ASCII. A trailing place
  # name from CITIES (optionally followed by 縣 and/or 市) is removed before
  # the text after "記者" is returned. Returns nil when there is no dateline
  # or no "記者" marker.
  def parse_reporter_name
    text = doc.css('.news_content .box_2').text
    text = text[/[（(]中央社(.*?)\d{1,2}日/,1]
    return nil if text.nil?

    # gsub! returns nil when nothing changed, so find stops at the first
    # city that was actually stripped.
    CITIES.find do |city|
      text.gsub!(/#{city}(?:縣市|縣|市)?$/,'')
    end
    # TODO proper location name removal
    text[%r{記者(.+)},1]
  end

  # Always false for this parser; how the flag is used is defined outside
  # this class.
  def reproduced?
    false
  end

  # Returns the digit run that precedes the optional "-<digit>" suffix and
  # the file extension, or nil when the URL has no such part.
  def self.parse_url_id(url)
    url[%r{/(\d+)(?:\-\d)?\.},1]
  end
end
@@ -0,0 +1,52 @@
1
# Parser for CTS (華視) article pages on cts.com.tw.
#
# Fills the @article hash from the downloaded HTML. @article, url and
# clean_up are provided outside this class.
class TaiwaneseNewsParser::Parser::Cts < TaiwaneseNewsParser::Parser
  # Domain fragment identifying this news site.
  def self.domain
    'cts.com.tw'
  end

  # Names under which this news company is known.
  def self.names
    ['華視']
  end

  # True (a MatchData) when the URL belongs to cts.com.tw, nil otherwise.
  def self.applicable?(url)
    url.match(%r{cts\.com\.tw/})
  end

  # Downloads and parses the article page.
  #
  # The parsed document is cached: parse and its helpers call doc five
  # times, and without the cache each call issued a new HTTP request.
  # NOTE(review): open(url) relies on open-uri being loaded elsewhere.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://news.cts.com.tw/cts/politics/201403/201403191393958.html'
  # Extracts all article fields and returns the @article hash.
  def parse
    @article[:title] = doc.at_css('table h1').text
    @article[:company_name] = parse_company_name
    @article[:content] = doc.css('#ctscontent p').text

    # The info span starts with "YYYY/M/D hh:mm"; seconds are appended
    # before handing it to Time.parse.
    time = doc.at_css('td.style14 span.info').text[%r{^\d{4}/\d{1,2}/\d{1,2} \d{2}:\d{2}}]
    @article[:published_at] = Time.parse("#{time}:00")

    @article[:reporter_name] = parse_reporter_name

    clean_up

    @article
  end

  # Extracts the reporter name from the info span.
  #
  # Strips the leading timestamp and anything from "地區" onward, then
  # returns the text preceding " 報導". Returns nil for "綜合報導" entries
  # or when no " 報導" marker is found.
  def parse_reporter_name
    text = doc.at_css('td.style14 span.info').text
    text.gsub!(%r{^\d{4}/\d{1,2}/\d{1,2} \d{2}:\d{2}},'')
    text.gsub!(%r{地區.+$},'')
    return nil if text.include?('綜合報導')

    text[%r{(.+) 報導},1]
  end

  # Reads the company name from the alt text of the logo image.
  def parse_company_name
    doc.at_css('table table div[align="right"] a img').attr(:alt)
  end

  # Returns the numeric file name of the article URL, or nil if absent.
  def self.parse_url_id(url)
    url[%r{/cts/.+/\d+/(\d+)\.html},1]
  end
end
@@ -0,0 +1,53 @@
1
# Parser for ETtoday (東森) article pages on ettoday.net.
#
# Fills the @article hash from the downloaded HTML. @article, url and
# clean_up are provided outside this class.
class TaiwaneseNewsParser::Parser::Ettoday < TaiwaneseNewsParser::Parser
  # Domain fragment identifying this news site.
  def self.domain
    'ettoday.net'
  end

  # Names under which this news company is known.
  def self.names
    %w{東森}
  end

  # Downloads and parses the article page.
  #
  # The parsed document is cached so repeated doc calls during one parse
  # do not each trigger a new HTTP request.
  # NOTE(review): open(url) relies on open-uri being loaded elsewhere.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://www.ettoday.net/news/20130128/158005.htm'
  # Extracts all article fields and returns the @article hash.
  # Raises NoMethodError if the page lacks the expected timestamp.
  def parse
    @article[:title] = doc.css('[itemprop=headline]').text

    @article[:company_name] = '東森'

    @article[:content] = doc.css('[itemprop=articleBody]>p').text

    @article[:reporter_name] = parse_reporter_name

    # Timestamp is rendered as "<Y>年<M>月<D>日 <h>:<m>".
    t = doc.css('.news-time').text.match(/(\d*)年(\d*)月(\d*)日 (\d*):(\d*)/)
    @article[:published_at] = Time.new(t[1],t[2],t[3],t[4],t[5])

    clean_up

    @article
  end

  # Returns the text between "記者" and the first slash in the article
  # body, or nil when there is no such byline. The slash may be ASCII "/",
  # full-width "／" or "╱".
  def parse_reporter_name
    text = doc.css('[itemprop=articleBody]').text
    match = text.match(%r{記者(.+?)[/／╱]})
    match && match[1]
  end

  # Normalises the stored URL through UrlCleaner (default configuration).
  def clean_url
    cleaner = TaiwaneseNewsParser::UrlCleaner.new()
    @article[:url] = cleaner.clean(@article[:url])
  end

  # Returns the "<digits>/<digits>" portion of the article URL, or nil.
  def self.parse_url_id(url)
    url[%r{http://www\.ettoday\.net/\w+/(\d+/\d+)},1]
  end

  # Always false for this parser; how the flag is used is defined outside
  # this class.
  def reproduced?
    false
  end
end
@@ -0,0 +1,66 @@
1
# Parser for Liberty Times (自由時報) live-news pages
# (libertytimes.com.tw/liveNews/news.php).
#
# Fills the @article hash from the downloaded HTML. @article, url and
# clean_up are provided outside this class.
class TaiwaneseNewsParser::Parser::LibertyTimes < TaiwaneseNewsParser::Parser
  # Domain fragment identifying this news site.
  def self.domain
    'libertytimes.com.tw'
  end

  # Names under which this news company is known (an Array, like the
  # other parsers return).
  def self.names
    %w{自由時報}
  end

  # True (a MatchData) only for live-news URLs, nil otherwise.
  def self.applicable?(url)
    url.match(%r{libertytimes\.com\.tw/liveNews/news\.php})
  end

  # Downloads and parses the article page.
  #
  # The parsed document is cached so repeated doc calls during one parse
  # do not each trigger a new HTTP request.
  # NOTE(review): open(url) relies on open-uri being loaded elsewhere.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://www.libertytimes.com.tw/2013/new/apr/13/today-sp2.htm'
  # NOTE(review): the sample URL above does not match applicable? of this
  # class (which requires liveNews/news.php) — it looks copied from another
  # parser.
  #
  # Extracts all article fields and returns the @article hash.
  def parse
    # new layout uses utf-8
    @article[:title] = doc.at_css('#newsti text()').text
    @article[:company_name] = parse_company_name
    @article[:content] = doc.css('#newsc.news_content').text

    stamp = doc.at_css('.conttime').text
    time = stamp[%r{\d{4}/\d{1,2}/\d{1,2} \d{2}:\d{2}}]
    if time.nil?
      # Only "hh:mm" is shown; combine it with today's date.
      match = stamp.match(%r{(\d{2}):(\d{2})})
      today = Date.today
      @article[:published_at] = Time.new(today.year, today.month, today.day, match[1].to_i, match[2].to_i)
    else
      @article[:published_at] = Time.parse("#{time}:00")
    end

    @article[:reporter_name] = parse_reporter_name

    clean_up

    @article
  end

  # Extracts the reporter name from @article[:content].
  #
  # Byline shapes tried in order (slash may be "/", "／" or "╱"):
  # 1. "〔<...記者name>／<...>〕" — text after "記者" in the first half,
  # 2. "記者<name>／",
  # 3. "（文／<name>）" with full-width or ASCII parentheses.
  # Returns nil when none is present.
  def parse_reporter_name
    content = @article[:content]
    if (match = content.match(%r{〔(.*?)[/／╱](.*?)〕}))
      match[1][%r{記者(.+)},1]
    elsif (match = content.match(%r{記者(.+?)[/／╱]}))
      match[1]
    elsif (match = content.match(%r{[（(]文[/／](.*?)[）)]}))
      match[1]
    end
  end

  # The company name is fixed for this site.
  def parse_company_name
    '自由時報'
  end

  # Normalises the stored URL through UrlCleaner.
  # NOTE(review): 'no' is presumably the query parameter to keep (it is the
  # one parse_url_id reads) — confirm against UrlCleaner.
  def clean_url
    cleaner = TaiwaneseNewsParser::UrlCleaner.new('no')
    @article[:url] = cleaner.clean(@article[:url])
  end

  # Returns the value of the "no" query parameter, or nil if absent.
  def self.parse_url_id(url)
    url[%r{news\.php\?no=(\d+)},1]
  end
end
@@ -0,0 +1,51 @@
1
# Parser for Liberty Times (自由時報) pages whose URL has the shape
# libertytimes.com.tw/<digits>/<word>/<word>/<digits>/<file>.htm.
#
# Fills the @article hash. doc, @article, url and clean_up are provided
# outside this class (this class does not override doc).
class TaiwaneseNewsParser::Parser::LibertyTimesBig5 < TaiwaneseNewsParser::Parser
  # Domain fragment identifying this news site.
  def self.domain
    'libertytimes.com.tw'
  end

  # Names under which this news company is known (an Array, like the
  # other parsers return).
  def self.names
    %w{自由時報}
  end

  # True (a MatchData) only for URLs of the dated .htm layout, nil otherwise.
  def self.applicable?(url)
    url.match(%r{libertytimes\.com\.tw/\d+/\w+/\w+/\d+/.+\.htm})
  end

  #url = 'http://www.libertytimes.com.tw/2013/new/apr/13/today-sp2.htm'
  # Extracts all article fields and returns the @article hash.
  def parse
    @article[:title] = doc.at_css('#newtitle').text
    @article[:company_name] = parse_company_name
    @article[:content] = doc.css('#newsContent>span:not(#newtitle)>p:not(.picture)').text

    @article[:reporter_name] = parse_reporter_name
    @article[:published_at] = Time.parse(doc.at_css('#date').text)

    clean_up

    @article
  end

  # Extracts the reporter name from @article[:content].
  #
  # Byline shapes tried in order (slash may be "/", "／" or "╱"):
  # 1. "〔<...記者name>／<...>〕" — text after "記者" in the first half,
  # 2. "記者<name>／",
  # 3. "（文／<name>）" with full-width or ASCII parentheses.
  # Returns nil when none is present.
  def parse_reporter_name
    content = @article[:content]
    if (match = content.match(%r{〔(.*?)[/／╱](.*?)〕}))
      match[1][%r{記者(.+)},1]
    elsif (match = content.match(%r{記者(.+?)[/／╱]}))
      match[1]
    elsif (match = content.match(%r{[（(]文[/／](.*?)[）)]}))
      match[1]
    end
  end

  # The company name is fixed for this site.
  def parse_company_name
    '自由時報'
  end

  # Normalises the stored URL through UrlCleaner, configured with an
  # empty string.
  def clean_url
    cleaner = TaiwaneseNewsParser::UrlCleaner.new('')
    @article[:url] = cleaner.clean(@article[:url])
  end

  # Returns the path between the host and ".htm", or nil if absent.
  def self.parse_url_id(url)
    url[%r{http://www\.libertytimes\.com\.tw/(.*)\.htm},1]
  end
end
@@ -0,0 +1,53 @@
1
# Parser for NOWnews (今日新聞) article pages on nownews.com.
#
# Fills the @article hash from the downloaded HTML. @article, url and
# clean_up are provided outside this class.
class TaiwaneseNewsParser::Parser::NowNews < TaiwaneseNewsParser::Parser
  # Domain fragment identifying this news site.
  def self.domain
    'nownews.com'
  end

  # Names under which this news company is known; the first one is used
  # as the company name in parse.
  def self.names
    %w{NowNews 今日新聞}
  end

  # Downloads and parses the article page.
  #
  # The parsed document is cached so repeated doc calls during one parse
  # do not each trigger a new HTTP request.
  # NOTE(review): open(url) relies on open-uri being loaded elsewhere.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://www.nownews.com/n/2014/03/21/1159861'
  # Extracts all article fields and returns the @article hash.
  # Raises NoMethodError if the page lacks the expected timestamp.
  def parse
    @article[:title] = doc.css('[itemprop=headline]').text

    @article[:company_name] = self.class.names.first

    @article[:content] = doc.css('[itemprop=articleBody]>p').text

    @article[:reporter_name] = parse_reporter_name

    # Timestamp is rendered as "<Y>年 <M>月 <D>日 ... <h>:<m>" with optional
    # whitespace and non-digit filler before the time.
    t = doc.css('#reporter_info p').text.match(/(\d*)年\s*(\d+)月\s*(\d+)日\D*(\d+):(\d+)/)
    @article[:published_at] = Time.new(t[1],t[2],t[3],t[4],t[5])

    clean_up

    @article
  end

  # Returns the text between "記者" and the first slash in the article
  # body, or nil when there is no such byline. The slash may be ASCII "/",
  # full-width "／" or "╱".
  def parse_reporter_name
    text = doc.css('[itemprop=articleBody]').text
    match = text.match(%r{記者(.+?)[/／╱]})
    match && match[1]
  end

  # Normalises the stored URL through UrlCleaner (default configuration).
  def clean_url
    cleaner = TaiwaneseNewsParser::UrlCleaner.new()
    @article[:url] = cleaner.clean(@article[:url])
  end

  # Returns the numeric segment following the three date-like segments of
  # the URL path, or nil if absent.
  def self.parse_url_id(url)
    url[%r{/\d+/\d+/\d+/(\d+)},1]
  end

  # Always false for this parser; how the flag is used is defined outside
  # this class.
  def reproduced?
    false
  end
end
@@ -0,0 +1,46 @@
1
# Parser for TVBS article pages (tvbs.com.tw/entry/...).
#
# Fills the @article hash from the downloaded HTML. @article, url and
# clean_up are provided outside this class.
class TaiwaneseNewsParser::Parser::Tvbs < TaiwaneseNewsParser::Parser
  # Domain fragment identifying this news site.
  def self.domain
    'tvbs.com.tw'
  end

  # Names under which this news company is known; the first one is used
  # as the company name.
  def self.names
    ['TVBS']
  end

  # True (a MatchData) only for "/entry" URLs on tvbs.com.tw, nil otherwise.
  def self.applicable?(url)
    url.match(%r{tvbs\.com\.tw/entry})
  end

  # Downloads and parses the article page.
  #
  # The parsed document is cached so repeated doc calls during one parse
  # do not each trigger a new HTTP request.
  # NOTE(review): open(url) relies on open-uri being loaded elsewhere.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  #url = 'http://news.tvbs.com.tw/entry/519673'
  # Extracts all article fields and returns the @article hash.
  def parse
    @article[:title] = doc.at_css('article h1').text
    @article[:company_name] = parse_company_name
    @article[:content] = doc.css('article .content').text

    # The dateline reads "時間:YYYY/M/D hh:mm"; the matched text (label
    # included) gets seconds appended and is handed to Time.parse.
    time = doc.at_css('article .meta-data .dateline').text[%r{時間:\d{4}/\d{1,2}/\d{1,2} \d{2}:\d{2}}]
    @article[:published_at] = Time.parse("#{time}:00")

    @article[:reporter_name] = parse_reporter_name

    clean_up

    @article
  end

  # Returns the text following "記者:" in the reporter element, or nil
  # when the label is absent.
  def parse_reporter_name
    doc.at_css('article .meta-data .reporter').text[%r{記者:(.+)},1]
  end

  # The company name is the first entry of self.names.
  def parse_company_name
    self.class.names.first
  end

  # Returns the numeric id following "/entry/", or nil if absent.
  def self.parse_url_id(url)
    url[%r{/entry/(\d+)},1]
  end
end
@@ -0,0 +1,43 @@
1
# Parser for United Daily News group (聯合報 / 聯合晚報) pages on udn.com.
#
# Fills the @article hash. doc, @article, url and clean_up are provided
# outside this class (this class does not override doc).
class TaiwaneseNewsParser::Parser::Udn < TaiwaneseNewsParser::Parser
  # Domain fragment identifying this news site.
  def self.domain
    'udn.com'
  end

  # Names of the outlets covered by this parser (an Array, like the other
  # parsers return).
  def self.names
    %w{聯合報 聯合晚報}
  end

  #url = 'http://udn.com/NEWS/NATIONAL/NATS5/7807573.shtml'
  # Extracts all article fields and returns the @article hash.
  def parse
    @article[:title] = doc.at_css('#story_title').text
    @article[:content] = doc.at_css('#story').text

    #a.web_published_at = Time.parse(doc.at_css('#story_update').text)

    @article[:company_name] = parse_company_name
    @article[:reporter_name] = parse_reporter_name

    @article[:published_at] = Time.parse(doc.at_css('#story_update').text)

    clean_up

    @article
  end

  # Returns the part of the byline before the first slash.
  # Raises NoMethodError when the byline is missing or has no slash.
  def parse_company_name
    get_company_name_and_reporter_name.match(%r{^(.*?)[/／╱]})[1]
  end

  # Returns the text between "／記者" and the last following slash in the
  # byline, or nil when that shape is absent. The slash may be ASCII "/",
  # full-width "／" or "╱".
  def parse_reporter_name
    get_company_name_and_reporter_name[%r{[/／╱]記者(.*)[/／╱]},1]
  end

  # Returns the digit run following two word-like path segments of the
  # URL, or nil if absent.
  def self.parse_url_id(url)
    url[%r{\w+/\w+/(\d+)},1]
  end

  private

  # Returns the byline text enclosed in 【...】 inside #story_author,
  # or nil when no such brackets are present.
  def get_company_name_and_reporter_name
    doc.at_css('#story_author').text[%r{【(.*)】},1]
  end
end