hongkong-news-scrapers 0.4.1 → 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/hongkong/news/scrapers/apple_daily_scraper.rb +6 -1
- data/lib/hongkong/news/scrapers/mingpao_scraper.rb +6 -1
- data/lib/hongkong/news/scrapers/oriental_daily_scraper.rb +5 -1
- data/lib/hongkong/news/scrapers/phantom_scraper.rb +24 -0
- data/lib/hongkong/news/scrapers/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bd2a3a978a37ffcf1f0edafaca7b69d379f81983
|
4
|
+
data.tar.gz: 69974d9afb715fda8b72b6a8901e274fbd081dba
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 71e08113fbe2282382b4c6260d4b3dcf7c0ec89797f98bfd886d409a5840d516e0cc5660ffdf80e15e2bab05764598800882ff68b5c7c48d52771b9bd8abf2d6
|
7
|
+
data.tar.gz: 46e8d11e0aaa8c875a18a7f723ac07e90b634734d51ca2e096ef61cf6c663e3da65aca36df4d42f606ad11dab2244adcd1b7cc76787448d75b831d4db98507b7
|
@@ -14,12 +14,15 @@ module Hongkong
|
|
14
14
|
# Gather the article links listed on the Apple Daily front page.
#
# Visits the front page, builds one Link per `#article_ddl option`
# (title from the option text, url from its "value" attribute), drops
# entries whose url is nil, then runs `cleanup` before handing the
# links back.
#
# Returns the collection of Link objects that have a url.
def news_links
  visit "http://hk.apple.nextmedia.com/"

  candidates = all("#article_ddl option").map do |entry|
    Link.new.tap do |item|
      item.title = entry.text
      item.url = entry["value"]
    end
  end
  with_url = candidates.reject { |item| item.url.nil? }

  # Cleanup runs only after every option has been read from the page.
  cleanup
  with_url
end
|
24
27
|
|
25
28
|
# Extract article from page from Apple Daily
|
@@ -34,6 +37,8 @@ module Hongkong
|
|
34
37
|
document.content = page.evaluate_script("HongKongNews.getInnerText('#masterContent')")
|
35
38
|
document.screenshot_data = screenshot_data
|
36
39
|
document.image_url = doc.search("//meta[@property='og:image']/@content").first.text rescue nil
|
40
|
+
|
41
|
+
cleanup
|
37
42
|
document
|
38
43
|
end
|
39
44
|
end
|
@@ -17,12 +17,15 @@ module Hongkong
|
|
17
17
|
# Gather the article links from the Mingpao listing page.
#
# Visits LIST_URL, builds one Link per `.listing ul li a` anchor (title
# from the anchor text, url from its href resolved against LIST_URL),
# then runs `cleanup` before handing the links back.
#
# Returns the collection of Link objects.
def news_links
  visit LIST_URL

  collected = all(".listing ul li a").map do |node|
    Link.new.tap do |item|
      item.title = node.text
      item.url = URI.join(LIST_URL, node["href"]).to_s
    end
  end

  # Cleanup runs only after every anchor has been read from the page.
  cleanup
  collected
end
|
27
30
|
|
28
31
|
# Extract article from page from Mingpao
|
@@ -40,6 +43,8 @@ module Hongkong
|
|
40
43
|
document.content = page.evaluate_script("HongKongNews.getInnerText('article')")
|
41
44
|
document.screenshot_data = screenshot_data
|
42
45
|
document.image_url = doc.search("//meta[@property='og:image']/@content").first.text rescue nil
|
46
|
+
|
47
|
+
cleanup
|
43
48
|
document
|
44
49
|
end
|
45
50
|
end
|
@@ -17,12 +17,15 @@ module Hongkong
|
|
17
17
|
# Gather the article links from the Oriental Daily listing page.
#
# Visits LIST_URL, builds one Link per `#articleListSELECT option`
# (title from the option text, url from its "value" attribute resolved
# against LIST_URL), discards entries whose url ends with "#", then
# runs `cleanup` before handing the links back.
#
# Returns the collection of remaining Link objects.
def news_links
  visit LIST_URL

  built = all("#articleListSELECT option").map do |choice|
    item = Link.new
    item.title = choice.text
    item.url = URI.join(LIST_URL, choice["value"]).to_s
    item
  end
  usable = built.reject { |candidate| candidate.url.to_s.end_with?("#") }

  # Cleanup runs only after every option has been read from the page.
  cleanup
  usable
end
|
27
30
|
|
28
31
|
# Extract article from page
|
@@ -43,6 +46,7 @@ module Hongkong
|
|
43
46
|
image = doc.search("#contentCTN .photo img").first
|
44
47
|
document.image_url = URI::join(url, image["src"]).to_s if image
|
45
48
|
|
49
|
+
cleanup
|
46
50
|
document
|
47
51
|
end
|
48
52
|
end
|
@@ -58,6 +58,30 @@ module Hongkong
|
|
58
58
|
end
|
59
59
|
@doc
|
60
60
|
end
|
61
|
+
|
62
|
+
# Call when shutting down phantomjs.
|
63
|
+
# Wind down the current page: first wait (via wait_for_ajax) for
# outstanding AJAX requests, then reset the page's driver.
#
# Returns whatever the driver's `reset!` returns.
def cleanup
  wait_for_ajax

  driver = page.driver
  driver.reset!
end
|
67
|
+
|
68
|
+
private
|
69
|
+
|
70
|
+
# Workaround for phantomjs hanging.
|
71
|
+
# Block until finished_all_ajax_requests? reports true, giving up after
# Capybara's configured maximum wait.
#
# The wait limit is read from Capybara.default_max_wait_time when the
# installed Capybara provides it, falling back to the older
# Capybara.default_wait_time otherwise.
#
# Returns nil once the page is idle.
# Raises Timeout::Error if the page is still busy when the limit expires.
def wait_for_ajax
  max_wait =
    if Capybara.respond_to?(:default_max_wait_time)
      Capybara.default_max_wait_time
    else
      Capybara.default_wait_time
    end

  Timeout.timeout(max_wait) do
    # Pause briefly between polls instead of spinning: a tight loop burns
    # CPU and floods the browser with evaluate_script calls.
    sleep 0.05 until finished_all_ajax_requests?
  end
end
|
76
|
+
|
77
|
+
# Report whether the page has no active jQuery AJAX requests.
#
# Evaluates a script in the page that yields `jQuery.active` when jQuery
# is defined and 0 otherwise, and checks that the result is zero.
#
# Returns true when the count is zero, false when requests are still
# active or when the check itself fails with a StandardError (the error
# is printed and otherwise ignored).
def finished_all_ajax_requests?
  page.evaluate_script("(typeof jQuery !== \"undefined\") ? jQuery.active : 0").zero?
rescue StandardError => e
  # Only StandardError is swallowed: rescuing Exception would also trap
  # interrupts and exit requests, and may trap the interruption that a
  # surrounding Timeout block relies on to stop the polling loop.
  puts "ignored excpetion wiating ajax: #{e}"
  # Explicit false instead of the nil that `puts` would leave behind.
  false
end
|
84
|
+
|
61
85
|
end
|
62
86
|
end
|
63
87
|
end
|