hongkong-news-scrapers 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/hongkong-news-scrapers.gemspec +1 -0
- data/lib/hongkong/news/models/document.rb +1 -1
- data/lib/hongkong/news/scrapers/apple_daily_scraper.rb +8 -4
- data/lib/hongkong/news/scrapers/mingpao_scraper.rb +7 -3
- data/lib/hongkong/news/scrapers/oriental_daily_scraper.rb +51 -0
- data/lib/hongkong/news/scrapers/phantom_scraper.rb +9 -0
- data/lib/hongkong/news/scrapers/version.rb +1 -1
- data/lib/hongkong/news/scrapers.rb +2 -1
- metadata +16 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 325eddb50876330eefa2711493ca8070705319ce
|
4
|
+
data.tar.gz: ef032d1cda4305049bdfacbe2b02db85bc1127c8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 36f874ccb99d3ef14c0d2cf2db420369c16117c43f3d4753e0af4d0082a948dbbceec7776e5f8523ac7598e701fa54d2334a606ae506a73fadecbfa5359a512c
|
7
|
+
data.tar.gz: 32e782416317efbbb91e7947957d3d3683cec51d23db94f387fea7554765c76fb5cf708823b58320e3ec19ed607b1efb529ef8810d2669c710e191b797e45d1c
|
@@ -5,6 +5,10 @@ module Hongkong
|
|
5
5
|
module Scrapers
|
6
6
|
class AppleDailyScraper
|
7
7
|
include PhantomScraper
|
8
|
+
|
9
|
+
def name
|
10
|
+
"appledaily"
|
11
|
+
end
|
8
12
|
|
9
13
|
# Extract all news links from Apple Daily
|
10
14
|
def news_links
|
@@ -15,7 +19,7 @@ module Hongkong
|
|
15
19
|
link.title = option.text
|
16
20
|
link.url = option["value"]
|
17
21
|
link
|
18
|
-
end
|
22
|
+
end.reject { |l| l.url.nil? }
|
19
23
|
end
|
20
24
|
|
21
25
|
# Extract article from page from Apple Daily
|
@@ -23,13 +27,13 @@ module Hongkong
|
|
23
27
|
visit url
|
24
28
|
|
25
29
|
document = Document.new
|
26
|
-
document.source =
|
27
|
-
document.title =
|
30
|
+
document.source = name
|
31
|
+
document.title = doc.search("#articleContent h1").text.strip
|
28
32
|
document.url = url
|
29
33
|
document.html = html
|
30
34
|
document.content = page.evaluate_script("HongKongNews.getInnerText('#masterContent')")
|
31
35
|
document.screenshot_data = screenshot_data
|
32
|
-
|
36
|
+
document.image_url = doc.search("//meta[@property='og:image']/@content").first.text rescue nil
|
33
37
|
document
|
34
38
|
end
|
35
39
|
end
|
@@ -9,6 +9,10 @@ module Hongkong
|
|
9
9
|
|
10
10
|
LIST_URL = "http://news.mingpao.com/pns/%E6%96%B0%E8%81%9E%E7%B8%BD%E8%A6%BD/web_tc/archive/latest"
|
11
11
|
|
12
|
+
def name
|
13
|
+
"mingpao"
|
14
|
+
end
|
15
|
+
|
12
16
|
# Extract all news links from Mingpao
|
13
17
|
def news_links
|
14
18
|
visit LIST_URL
|
@@ -29,13 +33,13 @@ module Hongkong
|
|
29
33
|
first("article p")
|
30
34
|
|
31
35
|
document = Document.new
|
32
|
-
document.source =
|
33
|
-
document.title =
|
36
|
+
document.source = name
|
37
|
+
document.title = doc.search("h1").text
|
34
38
|
document.url = url
|
35
39
|
document.html = html
|
36
40
|
document.content = page.evaluate_script("HongKongNews.getInnerText('article')")
|
37
41
|
document.screenshot_data = screenshot_data
|
38
|
-
|
42
|
+
document.image_url = doc.search("//meta[@property='og:image']/@content").first.text rescue nil
|
39
43
|
document
|
40
44
|
end
|
41
45
|
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require_relative './phantom_scraper'
|
2
|
+
require 'uri'
|
3
|
+
|
4
|
+
module Hongkong
|
5
|
+
module News
|
6
|
+
module Scrapers
|
7
|
+
class OrientalDailyScraper
|
8
|
+
include PhantomScraper
|
9
|
+
|
10
|
+
LIST_URL = "http://orientaldaily.on.cc/"
|
11
|
+
|
12
|
+
def name
|
13
|
+
"orientaldaily"
|
14
|
+
end
|
15
|
+
|
16
|
+
# Extract all news links
|
17
|
+
def news_links
|
18
|
+
visit LIST_URL
|
19
|
+
|
20
|
+
all("#articleListSELECT option").collect do |option|
|
21
|
+
link = Link.new
|
22
|
+
link.title = option.text
|
23
|
+
link.url = URI::join(LIST_URL, option["value"]).to_s
|
24
|
+
link
|
25
|
+
end.reject { |l| l.url.to_s.end_with?("#") }
|
26
|
+
end
|
27
|
+
|
28
|
+
# Extract article from page
|
29
|
+
def news(url)
|
30
|
+
visit url
|
31
|
+
|
32
|
+
# wait for content to be loaded
|
33
|
+
first("#contentCTN-right")
|
34
|
+
|
35
|
+
document = Document.new
|
36
|
+
document.source = name
|
37
|
+
document.title = doc.search("h1").text
|
38
|
+
document.url = url
|
39
|
+
document.html = html
|
40
|
+
document.content = page.evaluate_script("HongKongNews.getInnerText('#contentCTN-top')") + "\n" + page.evaluate_script("HongKongNews.getInnerText('#contentCTN-right')")
|
41
|
+
document.screenshot_data = screenshot_data
|
42
|
+
|
43
|
+
image = doc.search("#contentCTN .photo img").first
|
44
|
+
document.image_url = URI::join(url, image["src"]).to_s if image
|
45
|
+
|
46
|
+
document
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'capybara/poltergeist'
|
2
|
+
require 'nokogiri'
|
2
3
|
require 'tempfile'
|
3
4
|
|
4
5
|
module Hongkong
|
@@ -45,6 +46,14 @@ module Hongkong
|
|
45
46
|
def html
|
46
47
|
page.html
|
47
48
|
end
|
49
|
+
|
50
|
+
# Get a Nokogiri Document for current page
|
51
|
+
def doc
|
52
|
+
unless @doc
|
53
|
+
@doc = Nokogiri::HTML(html)
|
54
|
+
end
|
55
|
+
@doc
|
56
|
+
end
|
48
57
|
end
|
49
58
|
end
|
50
59
|
end
|
@@ -4,4 +4,5 @@ require "hongkong/news/models/document"
|
|
4
4
|
require "hongkong/news/models/link"
|
5
5
|
|
6
6
|
require "hongkong/news/scrapers/apple_daily_scraper"
|
7
|
-
require "hongkong/news/scrapers/mingpao_scraper"
|
7
|
+
require "hongkong/news/scrapers/mingpao_scraper"
|
8
|
+
require "hongkong/news/scrapers/oriental_daily_scraper"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hongkong-news-scrapers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francis Chong
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: nokogiri
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
97
111
|
description: Scrape Hong Kong news for good.
|
98
112
|
email:
|
99
113
|
- francis@ignition.hk
|
@@ -118,6 +132,7 @@ files:
|
|
118
132
|
- lib/hongkong/news/scrapers.rb
|
119
133
|
- lib/hongkong/news/scrapers/apple_daily_scraper.rb
|
120
134
|
- lib/hongkong/news/scrapers/mingpao_scraper.rb
|
135
|
+
- lib/hongkong/news/scrapers/oriental_daily_scraper.rb
|
121
136
|
- lib/hongkong/news/scrapers/phantom_scraper.rb
|
122
137
|
- lib/hongkong/news/scrapers/phantom_scraper_extension.js
|
123
138
|
- lib/hongkong/news/scrapers/version.rb
|