hongkong-news-scrapers 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c55a3d970d1d5352121e572030ebca5de8ba0700
4
- data.tar.gz: fb6f758b612e98e23de79f03285dd55dbc760fd7
3
+ metadata.gz: 325eddb50876330eefa2711493ca8070705319ce
4
+ data.tar.gz: ef032d1cda4305049bdfacbe2b02db85bc1127c8
5
5
  SHA512:
6
- metadata.gz: 609c19b1a287c3f339d1902692a1f883862ddda4474d3c1b49d933a6e82da298998291032a72c6700bddfe73492b189b1c29edd6dc8426f442bb1437680893f3
7
- data.tar.gz: 02e67ae4872db3dd33bce4d5f09dbcf0a34dd3ea5e765f2a9343d4615bb22118a84602000c2680f34996ae48a58fa1b763220547d7ef4c7b0024c9294eb61946
6
+ metadata.gz: 36f874ccb99d3ef14c0d2cf2db420369c16117c43f3d4753e0af4d0082a948dbbceec7776e5f8523ac7598e701fa54d2334a606ae506a73fadecbfa5359a512c
7
+ data.tar.gz: 32e782416317efbbb91e7947957d3d3683cec51d23db94f387fea7554765c76fb5cf708823b58320e3ec19ed607b1efb529ef8810d2669c710e191b797e45d1c
@@ -26,4 +26,5 @@ Gem::Specification.new do |spec|
26
26
 
27
27
  spec.add_dependency 'capybara'
28
28
  spec.add_dependency 'poltergeist'
29
+ spec.add_dependency 'nokogiri'
29
30
  end
@@ -1,6 +1,6 @@
1
1
  module Hongkong
2
2
  module News
3
- class Document < Struct.new(:id, :source, :title, :url, :html, :content, :screenshot_data)
3
+ class Document < Struct.new(:id, :source, :title, :url, :html, :content, :screenshot_data, :image_url)
4
4
  end
5
5
  end
6
6
  end
@@ -5,6 +5,10 @@ module Hongkong
5
5
  module Scrapers
6
6
  class AppleDailyScraper
7
7
  include PhantomScraper
8
+
9
+ def name
10
+ "appledaily"
11
+ end
8
12
 
9
13
  # Extract all news links from Apple Daily
10
14
  def news_links
@@ -15,7 +19,7 @@ module Hongkong
15
19
  link.title = option.text
16
20
  link.url = option["value"]
17
21
  link
18
- end
22
+ end.reject { |l| l.url.nil? }
19
23
  end
20
24
 
21
25
  # Extract article from page from Apple Daily
@@ -23,13 +27,13 @@ module Hongkong
23
27
  visit url
24
28
 
25
29
  document = Document.new
26
- document.source = 'appledaily'
27
- document.title = first("#articleContent h1").text.strip
30
+ document.source = name
31
+ document.title = doc.search("#articleContent h1").text.strip
28
32
  document.url = url
29
33
  document.html = html
30
34
  document.content = page.evaluate_script("HongKongNews.getInnerText('#masterContent')")
31
35
  document.screenshot_data = screenshot_data
32
-
36
+ document.image_url = doc.search("//meta[@property='og:image']/@content").first.text rescue nil
33
37
  document
34
38
  end
35
39
  end
@@ -9,6 +9,10 @@ module Hongkong
9
9
 
10
10
  LIST_URL = "http://news.mingpao.com/pns/%E6%96%B0%E8%81%9E%E7%B8%BD%E8%A6%BD/web_tc/archive/latest"
11
11
 
12
+ def name
13
+ "mingpao"
14
+ end
15
+
12
16
  # Extract all news links from Mingpao
13
17
  def news_links
14
18
  visit LIST_URL
@@ -29,13 +33,13 @@ module Hongkong
29
33
  first("article p")
30
34
 
31
35
  document = Document.new
32
- document.source = 'mingpao'
33
- document.title = first("h1").text
36
+ document.source = name
37
+ document.title = doc.search("h1").text
34
38
  document.url = url
35
39
  document.html = html
36
40
  document.content = page.evaluate_script("HongKongNews.getInnerText('article')")
37
41
  document.screenshot_data = screenshot_data
38
-
42
+ document.image_url = doc.search("//meta[@property='og:image']/@content").first.text rescue nil
39
43
  document
40
44
  end
41
45
  end
@@ -0,0 +1,51 @@
1
+ require_relative './phantom_scraper'
2
+ require 'uri'
3
+
4
+ module Hongkong
5
+ module News
6
+ module Scrapers
7
+ class OrientalDailyScraper
8
+ include PhantomScraper
9
+
10
+ LIST_URL = "http://orientaldaily.on.cc/"
11
+
12
+ def name
13
+ "orientaldaily"
14
+ end
15
+
16
+ # Extract all news links
17
+ def news_links
18
+ visit LIST_URL
19
+
20
+ all("#articleListSELECT option").collect do |option|
21
+ link = Link.new
22
+ link.title = option.text
23
+ link.url = URI::join(LIST_URL, option["value"]).to_s
24
+ link
25
+ end.reject { |l| l.url.to_s.end_with?("#") }
26
+ end
27
+
28
+ # Extract article from page
29
+ def news(url)
30
+ visit url
31
+
32
+ # wait for content to be loaded
33
+ first("#contentCTN-right")
34
+
35
+ document = Document.new
36
+ document.source = name
37
+ document.title = doc.search("h1").text
38
+ document.url = url
39
+ document.html = html
40
+ document.content = page.evaluate_script("HongKongNews.getInnerText('#contentCTN-top')") + "\n" + page.evaluate_script("HongKongNews.getInnerText('#contentCTN-right')")
41
+ document.screenshot_data = screenshot_data
42
+
43
+ image = doc.search("#contentCTN .photo img").first
44
+ document.image_url = URI::join(url, image["src"]).to_s if image
45
+
46
+ document
47
+ end
48
+ end
49
+ end
50
+ end
51
+ end
@@ -1,4 +1,5 @@
1
1
  require 'capybara/poltergeist'
2
+ require 'nokogiri'
2
3
  require 'tempfile'
3
4
 
4
5
  module Hongkong
@@ -45,6 +46,14 @@ module Hongkong
45
46
  def html
46
47
  page.html
47
48
  end
49
+
50
+ # Get a Nokogiri Document for current page
51
+ def doc
52
+ unless @doc
53
+ @doc = Nokogiri::HTML(html)
54
+ end
55
+ @doc
56
+ end
48
57
  end
49
58
  end
50
59
  end
@@ -1,7 +1,7 @@
1
1
  module Hongkong
2
2
  module News
3
3
  module Scrapers
4
- VERSION = "0.3.1"
4
+ VERSION = "0.4.0"
5
5
  end
6
6
  end
7
7
  end
@@ -4,4 +4,5 @@ require "hongkong/news/models/document"
4
4
  require "hongkong/news/models/link"
5
5
 
6
6
  require "hongkong/news/scrapers/apple_daily_scraper"
7
- require "hongkong/news/scrapers/mingpao_scraper"
7
+ require "hongkong/news/scrapers/mingpao_scraper"
8
+ require "hongkong/news/scrapers/oriental_daily_scraper"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hongkong-news-scrapers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francis Chong
@@ -94,6 +94,20 @@ dependencies:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: nokogiri
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
97
111
  description: Scrape Hong Kong news for good.
98
112
  email:
99
113
  - francis@ignition.hk
@@ -118,6 +132,7 @@ files:
118
132
  - lib/hongkong/news/scrapers.rb
119
133
  - lib/hongkong/news/scrapers/apple_daily_scraper.rb
120
134
  - lib/hongkong/news/scrapers/mingpao_scraper.rb
135
+ - lib/hongkong/news/scrapers/oriental_daily_scraper.rb
121
136
  - lib/hongkong/news/scrapers/phantom_scraper.rb
122
137
  - lib/hongkong/news/scrapers/phantom_scraper_extension.js
123
138
  - lib/hongkong/news/scrapers/version.rb