taiwanese_news_parser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/Gemfile +4 -0
  4. data/LICENSE.txt +22 -0
  5. data/README.md +23 -0
  6. data/Rakefile +4 -0
  7. data/g0v.json +37 -0
  8. data/lib/taiwanese_news_parser/parser/apple_daily.rb +69 -0
  9. data/lib/taiwanese_news_parser/parser/china_times.rb +76 -0
  10. data/lib/taiwanese_news_parser/parser/cna.rb +59 -0
  11. data/lib/taiwanese_news_parser/parser/cts.rb +52 -0
  12. data/lib/taiwanese_news_parser/parser/ettoday.rb +53 -0
  13. data/lib/taiwanese_news_parser/parser/liberty_times.rb +66 -0
  14. data/lib/taiwanese_news_parser/parser/liberty_times_big5.rb +51 -0
  15. data/lib/taiwanese_news_parser/parser/now_news.rb +53 -0
  16. data/lib/taiwanese_news_parser/parser/tvbs.rb +46 -0
  17. data/lib/taiwanese_news_parser/parser/udn.rb +43 -0
  18. data/lib/taiwanese_news_parser/parser.rb +57 -0
  19. data/lib/taiwanese_news_parser/url_cleaner.rb +19 -0
  20. data/lib/taiwanese_news_parser/version.rb +3 -0
  21. data/lib/taiwanese_news_parser.rb +15 -0
  22. data/spec/spec_helper.rb +9 -0
  23. data/spec/taiwanese_news_parser/parser/apple_daily_s1.html +484 -0
  24. data/spec/taiwanese_news_parser/parser/apple_daily_s2.html +333 -0
  25. data/spec/taiwanese_news_parser/parser/apple_daily_s3.html +334 -0
  26. data/spec/taiwanese_news_parser/parser/apple_daily_spec.rb +57 -0
  27. data/spec/taiwanese_news_parser/parser/china_times_s1.html +513 -0
  28. data/spec/taiwanese_news_parser/parser/china_times_s2.html +538 -0
  29. data/spec/taiwanese_news_parser/parser/china_times_s3.html +893 -0
  30. data/spec/taiwanese_news_parser/parser/china_times_s4.html +1045 -0
  31. data/spec/taiwanese_news_parser/parser/china_times_spec.rb +63 -0
  32. data/spec/taiwanese_news_parser/parser/cna_s1.html +1616 -0
  33. data/spec/taiwanese_news_parser/parser/cna_spec.rb +33 -0
  34. data/spec/taiwanese_news_parser/parser/cts_s1.html +672 -0
  35. data/spec/taiwanese_news_parser/parser/cts_s2.html +672 -0
  36. data/spec/taiwanese_news_parser/parser/cts_spec.rb +36 -0
  37. data/spec/taiwanese_news_parser/parser/ettoday_s1.html +1817 -0
  38. data/spec/taiwanese_news_parser/parser/ettoday_s2.html +1822 -0
  39. data/spec/taiwanese_news_parser/parser/ettoday_spec.rb +35 -0
  40. data/spec/taiwanese_news_parser/parser/liberty_times_big5_s1.html +213 -0
  41. data/spec/taiwanese_news_parser/parser/liberty_times_big5_spec.rb +31 -0
  42. data/spec/taiwanese_news_parser/parser/liberty_times_s1.html +145 -0
  43. data/spec/taiwanese_news_parser/parser/liberty_times_spec.rb +29 -0
  44. data/spec/taiwanese_news_parser/parser/now_news_s1.html +968 -0
  45. data/spec/taiwanese_news_parser/parser/now_news_s2.html +986 -0
  46. data/spec/taiwanese_news_parser/parser/now_news_spec.rb +31 -0
  47. data/spec/taiwanese_news_parser/parser/tvbs_s1.html +734 -0
  48. data/spec/taiwanese_news_parser/parser/tvbs_s2.html +739 -0
  49. data/spec/taiwanese_news_parser/parser/tvbs_spec.rb +36 -0
  50. data/spec/taiwanese_news_parser/parser/udn_s1.html +1678 -0
  51. data/spec/taiwanese_news_parser/parser/udn_spec.rb +42 -0
  52. data/taiwanese_news_parser.gemspec +30 -0
  53. metadata +237 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 067e4fb14a386429fc75be6ad0ffd3169677d9a7
4
+ data.tar.gz: b3c9af0188c2df29573c725c09919530a1a8099c
5
+ SHA512:
6
+ metadata.gz: 5d6fde19818bfdf3a408dd985fec21b07dcf7a117a321a35e788bba3861cc8179f50203dea756cc8e841a1d3b3f7f1ece849f15a7084af987af9d63de54e1bf4
7
+ data.tar.gz: f32b846ff9bc138b611bb330bea80612f129ce62951f06d36a7fb953fa383f029bdbcea7007ff134eb6f613358cb773c84ca2e39d883e08df3dc445547e6e786
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in taiwanese_news_parser.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 lulalala
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,23 @@
1
+ # TaiwaneseNewsParser
2
+
3
+ 台灣各新聞網站新聞解析器
4
+
5
+ ## Installation
6
+
7
+ gem 'taiwanese_news_parser'
8
+
9
+ ## Usage
10
+
11
+ TaiwaneseNewsParser.parse(url)
12
+
13
+ 會回傳新聞資訊的 hash
14
+
15
+ ## Contributing
16
+
17
+ 想要協助的朋友可以幫忙為其他新聞網站寫解析器。實作細節請參考個別解析器以及[wiki](https://github.com/lulalala/taiwanese_news_parser/wiki)。
18
+
19
+ 1. Fork it
20
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
21
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
22
+ 4. Push to the branch (`git push origin my-new-feature`)
23
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,4 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new
data/g0v.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "author": "lulalala",
3
+ "contributors": [],
4
+ "status": "alpha",
5
+ "name": "taiwanese_news_parser",
6
+ "name_zh": "台灣新聞網站解析器",
7
+ "description": "Parse Taiwanese based news agency website news articles",
8
+ "description_zh": "台灣新聞媒體網站的新聞解析庫",
9
+ "homepage": "https://github.com/lulalala/taiwanese_news_parser",
10
+ "document": "http://hack.g0v.tw/taiwanese_news_parser",
11
+ "repository": {
12
+ "type": "git",
13
+ "url": "https://github.com/lulalala/taiwanese_news_parser"
14
+ },
15
+ "licenses": [
16
+ {
17
+ "type": "MIT"
18
+ }
19
+ ],
20
+ "keywords": [
21
+ "ruby"
22
+ ],
23
+ "audience": [
24
+ "public"
25
+ ],
26
+ "products": [
27
+ "library",
28
+ "api"
29
+ ],
30
+ "projects": [
31
+ "taiwanese_news_parser"
32
+ ],
33
+ "thumbnail": "",
34
+ "needs": [
35
+ "programmer"
36
+ ]
37
+ }
@@ -0,0 +1,69 @@
1
# Parser for Apple Daily (蘋果日報) articles served from appledaily.com.tw.
class TaiwaneseNewsParser::Parser::AppleDaily < TaiwaneseNewsParser::Parser
  # strptime formats accepted by .parse_time, tried in this order.
  TIME_FORMATS = ['%Y年%m月%d日%H:%M', '%Y年%m月%d日'].freeze

  # Domain used to recognise Apple Daily URLs.
  def self.domain
    'appledaily.com.tw'
  end

  # Names this outlet is known by.
  def self.names
    %w{蘋果日報}
  end

  # Fetches and parses the article page once, then reuses the parsed document.
  # Without the memoisation every field lookup in #parse triggered a new download.
  #
  # NOTE(review): Kernel#open on a caller-supplied string also runs a
  # subprocess when the string starts with "|"; validate the URL upstream.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  # Fills @article with title, company, content, reporter and publish time,
  # runs the inherited clean_up, and returns the @article hash.
  #
  #url = 'http://www.appledaily.com.tw/appledaily/article/headline/20130414/34951658'
  def parse
    @article[:title] = doc.at_css('#h1').text
    @article[:company_name] = parse_company_name
    @article[:content] = doc.css('.articulum').css('p,h2').text
    @article[:reporter_name] = parse_reporter_name
    @article[:published_at] = self.class.parse_time(doc.css('.gggs time').text)

    clean_up

    @article
  end

  # The publishing company is always Apple Daily for this parser.
  def parse_company_name
    '蘋果日報'
  end

  # Extracts the reporter from the article body. A "◎記者…" signature wins
  # over a "【記者…/" byline; returns nil when neither is present.
  def parse_reporter_name
    text = doc.css('.articulum').css('p,h2').text.strip
    # The slash class accepts ASCII, full-width, box-drawing and division slashes.
    match = text.match(%r{◎記者(.+)$}) || text.match(%r{【(?:記者)?(.+?)[//╱∕]})
    match && match[1]
  end

  # Drops the trailing title segment from the article URL. A purely numeric
  # last segment is the article id and is kept, so URLs without a title are
  # left untouched. Assigns a new string instead of mutating the caller's URL.
  def clean_url
    @article[:url] = @article[:url].sub(%r{/(?!\d+\z)[^/]*\z}, '')
  end

  # Returns the slash-joined numeric path segments following
  # "/<section>/article/<category>/" (e.g. "20130414/34951658"), or nil when
  # the URL does not have that shape.
  def self.parse_url_id(url)
    url[%r{https?://www\.appledaily\.com\.tw/\w+/article/\w+/(\d+(?:/\d+)*)(?=[/?#]|\z)}, 1]
  end

  # Parses a timestamp such as "2013年04月14日06:00" or "2013年04月14日".
  # Returns a DateTime, or nil when raw_time matches none of TIME_FORMATS.
  def self.parse_time(raw_time)
    TIME_FORMATS.each do |format|
      begin
        return DateTime.strptime(raw_time, format)
      rescue ArgumentError, TypeError
        # Not this format (or raw_time is not a string); try the next one.
        next
      end
    end

    nil
  end
end
@@ -0,0 +1,76 @@
1
# Parser for the China Times group (chinatimes.com).
class TaiwaneseNewsParser::Parser::ChinaTimes < TaiwaneseNewsParser::Parser
  # Section labels shown on the page mapped to the company name to report.
  COMPANY_ALIASES = {
    '時週精選' => '時報週刊',
    '新聞速報' => '中時電子報'
  }.freeze

  # Domain used to recognise China Times URLs.
  def self.domain
    'chinatimes.com'
  end

  # Names of the outlets published under this domain.
  def self.names
    %w{中國時報 中時電子報 工商時報 旺報 時報週刊 中天 中視 中廣}
  end

  # Fetches and parses the article page once, then reuses the parsed document.
  # Without the memoisation every field lookup in #parse triggered a new download.
  #
  # NOTE(review): Kernel#open on a caller-supplied string also runs a
  # subprocess when the string starts with "|"; validate the URL upstream.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  # Fills @article with title, company, content, reporter and publish time,
  # runs the inherited clean_up, and returns the @article hash.
  # Raises NoMethodError when the page carries no recognisable timestamp.
  #
  #url = 'http://news.chinatimes.com/mainland/11050505/112013041400325.html'
  #url = 'http://www.chinatimes.com/realtimenews/%E6%AD%BB%E4%BA%A1%E9%9B%B2%E9%9C%84%E9%A3%9B%E8%BB%8A-%E7%BE%8E%E5%A9%A6%E5%A2%9C%E8%90%BD%E8%BA%AB%E4%BA%A1-20130720002354-260408'
  def parse
    @article[:title] = doc.at_css('.page_container header h1').text
    @article[:company_name] = parse_company_name
    @article[:content] = doc.css('.page_container article>p').text

    #@article[:web_published_at] = Time.parse(doc.at_css('#story_update').text)

    @article[:reporter_name] = parse_reporter_name

    # Require at least one digit per field and hand Time.new integers.
    stamp = doc.css('.reporter time').text.match(/(\d+)年(\d+)月(\d+)日 (\d+):(\d+)/)
    @article[:published_at] = Time.new(*stamp.captures.map(&:to_i))

    clean_up

    @article
  end

  # Returns the author link text when present. Otherwise looks for a
  # "記者…/" or "【…/" byline in the reporter block and falls back to the
  # block's whole text.
  def parse_reporter_name
    author_link = doc.at_css('.reporter a[rel=author]')
    return author_link.text if author_link

    text = doc.css('.reporter>text()').text
    # The slash class accepts ASCII, full-width, box-drawing and division slashes.
    match = text.match(%r{記者(.+?)[//╱∕]}) || text.match(%r{【(.+?)[//╱∕]})
    match ? match[1] : text
  end

  # Reads the section link in the reporter block and translates the labels
  # listed in COMPANY_ALIASES; any other label is returned unchanged.
  def parse_company_name
    label = doc.at_css('.reporter>a').text
    COMPANY_ALIASES.fetch(label, label)
  end

  # Normalises the article URL with UrlCleaner, configured with 'id'.
  def clean_url
    cleaner = TaiwaneseNewsParser::UrlCleaner.new('id')
    @article[:url] = cleaner.clean(@article[:url])
  end

  # Derives an id from the URL, trying in order: the "<digits>/<digits>" path
  # of news.chinatimes.com, the first number in a "-<digits>-<digits>" tail,
  # and finally everything after "chinatimes.com/". Returns nil if none match.
  def self.parse_url_id(url)
    url[%r{https?://news\.chinatimes\.com/\w+/(\d+/\d+)}, 1] ||
      url[%r{-(\d+)-\d+}, 1] ||
      url[%r{chinatimes\.com/(.+)}, 1]
  end
end
@@ -0,0 +1,59 @@
1
# Parser for Central News Agency (中央社) articles on cna.com.tw.
class TaiwaneseNewsParser::Parser::Cna < TaiwaneseNewsParser::Parser
  # Place names that may trail the reporter's name in the dateline.
  CITIES = %w{台北 新北 台中 台南 高雄 基隆 新竹 嘉義 桃園 苗栗 彰化 南投 雲林 屏東 宜蘭 花蓮 台東 澎湖 金門 連江}.freeze

  # Matches one of CITIES, optionally followed by 縣 or 市, at the end of a string.
  CITY_SUFFIX = /(?:#{CITIES.join('|')})[縣市]?\z/

  # Domain used to recognise CNA URLs.
  def self.domain
    'cna.com.tw'
  end

  # Names this outlet is known by, as an Array like the sibling parsers return
  # (the previous %{} literal produced a plain String).
  def self.names
    %w{中央社}
  end

  # Fetches and parses the article page once, then reuses the parsed document.
  # Without the memoisation every field lookup in #parse triggered a new download.
  #
  # NOTE(review): Kernel#open on a caller-supplied string also runs a
  # subprocess when the string starts with "|"; validate the URL upstream.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  # Fills @article with title, company, content, reporter and publish time,
  # runs the inherited clean_up, and returns the @article hash.
  #
  #url = 'http://www.cna.com.tw/News/aSaM/201304120296-1.aspx'
  def parse
    body = doc.css('.news_content .box_2').text

    @article[:title] = doc.at_css('.news_content h1').text
    @article[:company_name] = '中央社'
    @article[:content] = body
    @article[:reporter_name] = parse_reporter_name

    # The first run of seven digits in the body is read as a yyyMMdd date;
    # adding 1911 to the three-digit year yields the Gregorian year.
    stamp = body.strip.match(/(\d{3})(\d{2})(\d{2})/)
    date = [stamp[1].to_i + 1911, stamp[2], stamp[3]].join('/')
    @article[:published_at] = Time.parse("#{date} #{doc.css('.date').text}")

    clean_up

    @article
  end

  # Extracts the reporter from a dateline such as "(中央社記者…台北12日".
  # Returns nil when the dateline is missing or names no reporter.
  def parse_reporter_name
    text = doc.css('.news_content .box_2').text
    # Accept an ASCII or a full-width opening parenthesis before "中央社".
    dateline = text[/[((]中央社(.*?)\d{1,2}日/, 1]
    return nil if dateline.nil?

    # TODO proper location name removal
    dateline = dateline.sub(CITY_SUFFIX, '')
    dateline[%r{記者(.+)}, 1]
  end

  # This parser always reports its articles as not reproduced.
  def reproduced?
    false
  end

  # Returns the digits of the file name, ignoring an optional "-<digit>"
  # suffix before the extension.
  def self.parse_url_id(url)
    url[%r{/(\d+)(?:\-\d)?\.}, 1]
  end
end
@@ -0,0 +1,52 @@
1
# Parser for CTS (華視) news articles on cts.com.tw.
class TaiwaneseNewsParser::Parser::Cts < TaiwaneseNewsParser::Parser
  # Leading "YYYY/M/D HH:MM" stamp of the article info line.
  TIMESTAMP = %r{^\d{4}/\d{1,2}/\d{1,2} \d{2}:\d{2}}

  # Domain used to recognise CTS URLs.
  def self.domain
    'cts.com.tw'
  end

  # Names this outlet is known by.
  def self.names
    ['華視']
  end

  # Truthy (a MatchData) when the URL points at cts.com.tw, nil otherwise.
  def self.applicable?(url)
    url.match(%r{cts\.com\.tw/})
  end

  # Fetches and parses the article page once, then reuses the parsed document.
  # Without the memoisation every field lookup in #parse triggered a new download.
  #
  # NOTE(review): Kernel#open on a caller-supplied string also runs a
  # subprocess when the string starts with "|"; validate the URL upstream.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  # Fills @article with title, company, content, publish time and reporter,
  # runs the inherited clean_up, and returns the @article hash.
  #
  #url = 'http://news.cts.com.tw/cts/politics/201403/201403191393958.html'
  def parse
    @article[:title] = doc.at_css('table h1').text
    @article[:company_name] = parse_company_name
    @article[:content] = doc.css('#ctscontent p').text

    time = info_line[TIMESTAMP]
    @article[:published_at] = Time.parse("#{time}:00")

    @article[:reporter_name] = parse_reporter_name

    clean_up

    @article
  end

  # Reads the reporter from the info line after removing the timestamp and
  # anything from "地區" onwards. Returns nil for "綜合報導" (no single
  # reporter) or when no "<name> 報導" is found.
  def parse_reporter_name
    info = info_line.gsub(TIMESTAMP, '').gsub(%r{地區.+$}, '')
    return nil if info.include?('綜合報導')

    info[%r{(.+) 報導}, 1]
  end

  # The company name is the alt text of the logo image in the page header.
  def parse_company_name
    doc.at_css('table table div[align="right"] a img').attr(:alt)
  end

  # Returns the digits of the article's file name.
  def self.parse_url_id(url)
    url[%r{/cts/.+/\d+/(\d+)\.html}, 1]
  end

  private

  # Text of the info element holding the timestamp and the reporter credit.
  def info_line
    doc.at_css('td.style14 span.info').text
  end
end
@@ -0,0 +1,53 @@
1
# Parser for ETtoday (東森) articles on ettoday.net.
class TaiwaneseNewsParser::Parser::Ettoday < TaiwaneseNewsParser::Parser
  # Domain used to recognise ETtoday URLs.
  def self.domain
    'ettoday.net'
  end

  # Names this outlet is known by.
  def self.names
    %w{東森}
  end

  # Fetches and parses the article page once, then reuses the parsed document.
  # Without the memoisation every field lookup in #parse triggered a new download.
  #
  # NOTE(review): Kernel#open on a caller-supplied string also runs a
  # subprocess when the string starts with "|"; validate the URL upstream.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  # Fills @article with title, company, content, reporter and publish time,
  # runs the inherited clean_up, and returns the @article hash.
  # Raises NoMethodError when the page carries no recognisable timestamp.
  #
  #url = 'http://www.ettoday.net/news/20130128/158005.htm'
  def parse
    @article[:title] = doc.css('[itemprop=headline]').text
    @article[:company_name] = '東森'
    @article[:content] = doc.css('[itemprop=articleBody]>p').text
    @article[:reporter_name] = parse_reporter_name

    # Require at least one digit per field and hand Time.new integers.
    stamp = doc.css('.news-time').text.match(/(\d+)年(\d+)月(\d+)日 (\d+):(\d+)/)
    @article[:published_at] = Time.new(*stamp.captures.map(&:to_i))

    clean_up

    @article
  end

  # Returns the name between "記者" and the first slash in the article body,
  # or nil when there is no such byline.
  def parse_reporter_name
    text = doc.css('[itemprop=articleBody]').text
    # The slash class accepts ASCII, full-width, box-drawing and division slashes.
    text[%r{記者(.+?)[//╱∕]}, 1]
  end

  # Normalises the article URL with a default UrlCleaner.
  def clean_url
    cleaner = TaiwaneseNewsParser::UrlCleaner.new()
    @article[:url] = cleaner.clean(@article[:url])
  end

  # Returns the "<date>/<number>" part of the article URL, or nil.
  def self.parse_url_id(url)
    url[%r{https?://www\.ettoday\.net/\w+/(\d+/\d+)}, 1]
  end

  # This parser always reports its articles as not reproduced.
  def reproduced?
    false
  end
end
@@ -0,0 +1,66 @@
1
# Parser for Liberty Times (自由時報) live-news pages
# (libertytimes.com.tw/liveNews/news.php).
class TaiwaneseNewsParser::Parser::LibertyTimes < TaiwaneseNewsParser::Parser
  # Domain used to recognise Liberty Times URLs.
  def self.domain
    'libertytimes.com.tw'
  end

  # Names this outlet is known by, as an Array like the sibling parsers return
  # (the previous %{} literal produced a plain String).
  def self.names
    %w{自由時報}
  end

  # Truthy (a MatchData) only for live-news URLs, nil otherwise.
  def self.applicable?(url)
    url.match(%r{libertytimes\.com\.tw/liveNews/news\.php})
  end

  # Fetches and parses the article page once, then reuses the parsed document.
  # Without the memoisation every field lookup in #parse triggered a new download.
  #
  # NOTE(review): Kernel#open on a caller-supplied string also runs a
  # subprocess when the string starts with "|"; validate the URL upstream.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  # Fills @article with title, company, content, publish time and reporter,
  # runs the inherited clean_up, and returns the @article hash.
  #
  # Handles URLs of the form libertytimes.com.tw/liveNews/news.php?no=<id>.
  def parse
    # new layout uses utf-8
    @article[:title] = doc.at_css('#newsti text()').text
    @article[:company_name] = parse_company_name
    @article[:content] = doc.css('#newsc.news_content').text

    stamp = doc.at_css('.conttime').text
    full_time = stamp[%r{\d{4}/\d{1,2}/\d{1,2} \d{2}:\d{2}}]
    if full_time
      @article[:published_at] = Time.parse("#{full_time}:00")
    else
      # Only "HH:MM" is shown: combine it with today's date.
      clock = stamp.match(%r{(\d{2}):(\d{2})})
      today = Date.today
      @article[:published_at] = Time.new(today.year, today.month, today.day, clock[1].to_i, clock[2].to_i)
    end

    @article[:reporter_name] = parse_reporter_name

    clean_up

    @article
  end

  # Extracts the reporter from @article[:content], which #parse must have set.
  # Tries, in order: a "〔記者…/…〕" dateline, a bare "記者…/" byline, and a
  # parenthesised "(文/…)" credit. Returns nil when none is found.
  def parse_reporter_name
    content = @article[:content]
    # Slash and parenthesis classes accept both ASCII and full-width forms.
    if (match = content.match(%r{〔(.*?)[//╱∕](.*?)〕}))
      match[1][%r{記者(.+)}, 1]
    elsif (match = content.match(%r{記者(.+?)[//╱∕]}))
      match[1]
    elsif (match = content.match(%r{[((]文[//╱∕](.*?)[))]}))
      # Capture only the name, not the "文/" prefix.
      match[1]
    end
  end

  # The publishing company is always Liberty Times for this parser.
  def parse_company_name
    '自由時報'
  end

  # Normalises the article URL with UrlCleaner, configured with 'no'.
  def clean_url
    cleaner = TaiwaneseNewsParser::UrlCleaner.new('no')
    @article[:url] = cleaner.clean(@article[:url])
  end

  # Returns the value of the "no" query parameter, or nil.
  def self.parse_url_id(url)
    url[%r{news\.php\?no=(\d+)}, 1]
  end
end
@@ -0,0 +1,51 @@
1
# Parser for Liberty Times (自由時報) print-edition pages, whose URLs look like
# libertytimes.com.tw/<year>/<section>/<month>/<day>/<name>.htm.
class TaiwaneseNewsParser::Parser::LibertyTimesBig5 < TaiwaneseNewsParser::Parser
  # Domain used to recognise Liberty Times URLs.
  def self.domain
    'libertytimes.com.tw'
  end

  # Names this outlet is known by, as an Array like the sibling parsers return
  # (the previous %{} literal produced a plain String).
  def self.names
    %w{自由時報}
  end

  # Truthy (a MatchData) only for dated .htm article URLs, nil otherwise.
  def self.applicable?(url)
    url.match(%r{libertytimes\.com\.tw/\d+/\w+/\w+/\d+/.+\.htm})
  end

  # Fills @article with title, company, content, reporter and publish time,
  # runs the inherited clean_up, and returns the @article hash.
  #
  #url = 'http://www.libertytimes.com.tw/2013/new/apr/13/today-sp2.htm'
  def parse
    @article[:title] = doc.at_css('#newtitle').text
    @article[:company_name] = parse_company_name
    @article[:content] = doc.css('#newsContent>span:not(#newtitle)>p:not(.picture)').text

    @article[:reporter_name] = parse_reporter_name
    @article[:published_at] = Time.parse(doc.at_css('#date').text)

    clean_up

    @article
  end

  # Extracts the reporter from @article[:content], which #parse must have set.
  # Tries, in order: a "〔記者…/…〕" dateline, a bare "記者…/" byline, and a
  # parenthesised "(文/…)" credit. Returns nil when none is found.
  def parse_reporter_name
    content = @article[:content]
    # Slash and parenthesis classes accept both ASCII and full-width forms.
    if (match = content.match(%r{〔(.*?)[//╱∕](.*?)〕}))
      match[1][%r{記者(.+)}, 1]
    elsif (match = content.match(%r{記者(.+?)[//╱∕]}))
      match[1]
    elsif (match = content.match(%r{[((]文[//╱∕](.*?)[))]}))
      # Capture only the name, not the "文/" prefix.
      match[1]
    end
  end

  # The publishing company is always Liberty Times for this parser.
  def parse_company_name
    '自由時報'
  end

  # Normalises the article URL with UrlCleaner, configured with an empty string.
  def clean_url
    cleaner = TaiwaneseNewsParser::UrlCleaner.new('')
    @article[:url] = cleaner.clean(@article[:url])
  end

  # Returns the path between the host and ".htm", or nil.
  def self.parse_url_id(url)
    url[%r{https?://www\.libertytimes\.com\.tw/(.*)\.htm}, 1]
  end
end
@@ -0,0 +1,53 @@
1
# Parser for NowNews (今日新聞) articles on nownews.com.
class TaiwaneseNewsParser::Parser::NowNews < TaiwaneseNewsParser::Parser
  # Domain used to recognise NowNews URLs.
  def self.domain
    'nownews.com'
  end

  # Names this outlet is known by; the first one is used as the company name.
  def self.names
    %w{NowNews 今日新聞}
  end

  # Fetches and parses the article page once, then reuses the parsed document.
  # Without the memoisation every field lookup in #parse triggered a new download.
  #
  # NOTE(review): Kernel#open on a caller-supplied string also runs a
  # subprocess when the string starts with "|"; validate the URL upstream.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  # Fills @article with title, company, content, reporter and publish time,
  # runs the inherited clean_up, and returns the @article hash.
  # Raises NoMethodError when the page carries no recognisable timestamp.
  #
  #url = 'http://www.nownews.com/n/2014/03/21/1159861'
  def parse
    @article[:title] = doc.css('[itemprop=headline]').text
    @article[:company_name] = self.class.names.first
    @article[:content] = doc.css('[itemprop=articleBody]>p').text
    @article[:reporter_name] = parse_reporter_name

    # Require at least one digit per field and hand Time.new integers.
    stamp = doc.css('#reporter_info p').text.match(/(\d+)年\s*(\d+)月\s*(\d+)日\D*(\d+):(\d+)/)
    @article[:published_at] = Time.new(*stamp.captures.map(&:to_i))

    clean_up

    @article
  end

  # Returns the name between "記者" and the first slash in the article body,
  # or nil when there is no such byline.
  def parse_reporter_name
    text = doc.css('[itemprop=articleBody]').text
    # The slash class accepts ASCII, full-width, box-drawing and division slashes.
    text[%r{記者(.+?)[//╱∕]}, 1]
  end

  # Normalises the article URL with a default UrlCleaner.
  def clean_url
    cleaner = TaiwaneseNewsParser::UrlCleaner.new()
    @article[:url] = cleaner.clean(@article[:url])
  end

  # Returns the number following the three numeric date segments, or nil.
  def self.parse_url_id(url)
    url[%r{/\d+/\d+/\d+/(\d+)}, 1]
  end

  # This parser always reports its articles as not reproduced.
  def reproduced?
    false
  end
end
@@ -0,0 +1,46 @@
1
# Parser for TVBS news entries on tvbs.com.tw.
class TaiwaneseNewsParser::Parser::Tvbs < TaiwaneseNewsParser::Parser
  # Domain used to recognise TVBS URLs.
  def self.domain
    'tvbs.com.tw'
  end

  # Names this outlet is known by; the first one is used as the company name.
  def self.names
    ['TVBS']
  end

  # Truthy (a MatchData) only for "/entry" URLs, nil otherwise.
  def self.applicable?(url)
    url.match(%r{tvbs\.com\.tw/entry})
  end

  # Fetches and parses the article page once, then reuses the parsed document.
  # Without the memoisation every field lookup in #parse triggered a new download.
  #
  # NOTE(review): Kernel#open on a caller-supplied string also runs a
  # subprocess when the string starts with "|"; validate the URL upstream.
  def doc
    @doc ||= begin
      @raw = open(url).read
      Nokogiri::HTML(@raw)
    end
  end

  # Fills @article with title, company, content, publish time and reporter,
  # runs the inherited clean_up, and returns the @article hash.
  #
  #url = 'http://news.tvbs.com.tw/entry/519673'
  def parse
    @article[:title] = doc.at_css('article h1').text
    @article[:company_name] = parse_company_name
    @article[:content] = doc.css('article .content').text

    # Capture only the date/time so the "時間" label is not handed to
    # Time.parse; the colon may be ASCII or full-width.
    dateline = doc.at_css('article .meta-data .dateline').text
    time = dateline[%r{時間[::](\d{4}/\d{1,2}/\d{1,2} \d{2}:\d{2})}, 1]
    @article[:published_at] = Time.parse("#{time}:00")

    @article[:reporter_name] = parse_reporter_name

    clean_up

    @article
  end

  # Returns the text following "記者:" in the reporter element, or nil when
  # the label is absent. The colon may be ASCII or full-width.
  def parse_reporter_name
    doc.at_css('article .meta-data .reporter').text[%r{記者[::](.+)}, 1]
  end

  # The company name is the first entry of .names.
  def parse_company_name
    self.class.names.first
  end

  # Returns the numeric entry id from the URL, or nil.
  def self.parse_url_id(url)
    url[%r{/entry/(\d+)}, 1]
  end
end
@@ -0,0 +1,43 @@
1
# Parser for United Daily News group articles on udn.com.
class TaiwaneseNewsParser::Parser::Udn < TaiwaneseNewsParser::Parser
  # Domain used to recognise UDN URLs.
  def self.domain
    'udn.com'
  end

  # Names of the outlets published under this domain, as an Array like the
  # sibling parsers return (the previous %{} literal produced a single
  # space-joined String).
  def self.names
    %w{聯合報 聯合晚報}
  end

  # Fills @article with title, content, company, reporter and publish time,
  # runs the inherited clean_up, and returns the @article hash.
  #
  #url = 'http://udn.com/NEWS/NATIONAL/NATS5/7807573.shtml'
  def parse
    @article[:title] = doc.at_css('#story_title').text
    @article[:content] = doc.at_css('#story').text

    #a.web_published_at = Time.parse(doc.at_css('#story_update').text)

    @article[:company_name] = parse_company_name
    @article[:reporter_name] = parse_reporter_name

    @article[:published_at] = Time.parse(doc.at_css('#story_update').text)

    clean_up

    @article
  end

  # Returns the part of the bracketed byline before the first slash, or nil
  # when the byline is missing or contains no slash.
  def parse_company_name
    byline = get_company_name_and_reporter_name
    # The slash class accepts ASCII, full-width, box-drawing and division slashes.
    byline && byline[%r{\A(.*?)[//╱∕]}, 1]
  end

  # Returns the name between "/記者" and the last following slash in the
  # bracketed byline, or nil when the byline is missing or has no such part.
  def parse_reporter_name
    byline = get_company_name_and_reporter_name
    byline && byline[%r{[//╱∕]記者(.*)[//╱∕]}, 1]
  end

  # Returns the numeric file name that follows two path segments, or nil.
  def self.parse_url_id(url)
    url[%r{\w+/\w+/(\d+)}, 1]
  end

  private

  # Text between "【" and "】" in the #story_author element, or nil when the
  # element has no bracketed byline.
  def get_company_name_and_reporter_name
    doc.at_css('#story_author').text[%r{【(.*)】}, 1]
  end
end