bnext_robot 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +40 -0
- data/Rakefile +8 -0
- data/bin/bnext_robot +15 -0
- data/bnext_robot.gemspec +22 -0
- data/lib/ext_class/bnext_robot.rb +96 -0
- data/lib/int_class/feed.rb +23 -0
- data/lib/int_class/filter_condition.rb +41 -0
- data/lib/int_module/crawl_runner.rb +26 -0
- data/lib/int_module/feeds_filter.rb +11 -0
- data/spec/class_spec/bnext_robot_spec.rb +72 -0
- data/spec/testfiles/vcr_cassettes/bnext_mainpage.yml +1422 -0
- data/spec/testfiles/vcr_cassettes/bnext_techpage.yml +21639 -0
- metadata +138 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 9c225a805567ccb4cb9db98087c3b6c09bd83f12
  data.tar.gz: 52872627905196a8ea0b3d8683881c783f4a35e8
SHA512:
  metadata.gz: dcfe0e44c660d74c4213a5f9826ecd2cd02a5cc03509c966419a400123d3f7b8d09167badb70c9b571842b9bb92d8980d9fd64cd4dc305ccfb54371b2a996723
  data.tar.gz: 634dd5d0f6d7e594e670e3b7c1295c2371e0539a95392a7ab091a0bfddf2546f3d24dc5ddd69e6cd0ee8a0b3ec1b79c6c1fbf4584396867eeb915f76fa4726dd
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,40 @@
GEM
  remote: https://rubygems.org/
  specs:
    addressable (2.3.8)
    ansi (1.5.0)
    ast (2.1.0)
    crack (0.4.2)
      safe_yaml (~> 1.0.0)
    hashdiff (0.2.2)
    mini_portile (0.6.2)
    minitest (5.8.1)
    nokogiri (1.6.6.2)
      mini_portile (~> 0.6.0)
    oga (1.3.1)
      ast
      ruby-ll (~> 2.1)
    ruby-ll (2.1.2)
      ansi
      ast
    safe_yaml (1.0.4)
    vcr (2.9.3)
    webmock (1.22.1)
      addressable (>= 2.3.6)
      crack (>= 0.3.2)
      hashdiff
    xpath (2.0.0)
      nokogiri (~> 1.3)

PLATFORMS
  ruby

DEPENDENCIES
  minitest
  oga
  vcr
  webmock
  xpath

BUNDLED WITH
   1.10.6
data/Rakefile
ADDED
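The Rakefile's eight added lines are collapsed in this view. As a rough sketch only (hypothetical, not the gem's actual Rakefile), a minitest task for this spec layout could look like:

require 'rake/testtask'

# Run everything under spec/ with `rake spec`, or plain `rake` via the default task.
Rake::TestTask.new(:spec) do |t|
  t.pattern = 'spec/**/*_spec.rb'
end

task default: :spec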
data/bin/bnext_robot
ADDED
@@ -0,0 +1,15 @@
#!/usr/bin/env ruby
require_relative '../lib/ext_class/bnext_robot'

fail ArgumentError, "Usage: bnext_robot [week/day]\n" if ARGV.count == 0

bnext_robot = BNextRobot.new

type = ARGV[0]
if type == "week"
  bnext_robot.show_week_rank
elsif type == "day"
  bnext_robot.show_day_rank
else
  puts "Please type week or day"
end
data/bnext_robot.gemspec
ADDED
@@ -0,0 +1,22 @@
$LOAD_PATH.push File.expand_path('../robot', __FILE__)

Gem::Specification.new do |s|
  s.name        = 'bnext_robot'
  s.version     = '0.1.0'
  s.date        = '2015-10-18'
  s.executables << 'bnext_robot'
  s.summary     = 'Web scrapy for Business Next'
  s.description = 'Web scrapy for Business Next, including showing day/week rank and feeds extraction'
  s.authors     = ['Jacky Pan', 'Angela Hung', 'Edison Lee', 'Tony Lee']
  s.email       = ['jackypan000@gmail.com', 'angela.hung@iss.nthu.edu.tw', 'dfg1021@hotmail.com.tw', 'tony123930@yahoo.com.tw']
  s.files       = `git ls-files`.split("\n")
  s.test_files  = `git ls-files spec/*`.split("\n")
  s.homepage    = 'https://github.com/SOA-Upstart4/Team-HW-1-Ideate-and-Scrape'
  s.license     = 'MIT'

  s.add_development_dependency 'minitest'
  s.add_development_dependency 'minitest-rg'
  s.add_development_dependency 'vcr'
  s.add_development_dependency 'webmock'
  s.add_runtime_dependency 'oga'
end
data/lib/ext_class/bnext_robot.rb
ADDED
@@ -0,0 +1,96 @@
require_relative '../int_module/crawl_runner'
require_relative '../int_module/feeds_filter'
require_relative '../int_class/feed'
require_relative '../int_class/filter_condition'
require 'oga'
require 'open-uri'

# BNextRobot extracts titles and links of daily/weekly hot feeds.
class BNextRobot
  include Crawler
  include FeedFilter

  FEED_XPATH = "//a[contains(@class, 'item_title block_link')]/@href"
  TITLE_XPATH = "//div[contains(@class, 'main_title')]"
  TAG_XPATH = "//a[contains(@class, 'tag_link')]"
  INFO_XPATH = "//span[contains(@class, 'info')]"
  CONTENT_XPATH = "//div[contains(@class, 'content htmlview')]"
  IMGS_XPATH = "//div[contains(@class, 'content htmlview')]/p/img/@src"

  attr_accessor :day_rank_feeds, :week_rank_feeds

  def initialize
    load_page('http://www.bnext.com.tw/')
    analyze
    init_rank_feeds
  end

  def analyze
    cat_tags = @web_data.scan(/<li>.*?<\/li>/)
    atags = cat_tags.map { |x| x.match(/<a.*?<\/a>/).to_s }
    hrefs = atags.map { |x| x.match(/href=\".*?\"/).to_s[7..-2] }
    cat_names = atags.map { |x| x.match(/>.+?</).to_s[1..-2] }
    cats_pair = cat_names.zip(hrefs).select { |n, ref| ref.start_with? 'categories' }

    @cats = Hash.new(false)
    cats_pair.map { |n, ref| @cats[n] = @domain + ref }
    nil
  end

  def show_day_rank
    @day_rank_feeds.map { |feed| puts "#{feed.title}: #{feed.link}" }
    nil
  end

  def show_week_rank
    @week_rank_feeds.map { |feed| puts "#{feed.title}: #{feed.link}" }
    nil
  end

  def init_rank_feeds
    token_gen = ["//div[@id = '", "_rank']//a[@class = 'content']"]
    document = Oga.parse_html(@web_data)

    day_rank_hrefs = document.xpath(token_gen.join('day') + '/@href').map(&:text)
    week_rank_hrefs = document.xpath(token_gen.join('week') + '/@href').map(&:text)

    day_rank_titles = document.xpath(token_gen.join('day')).map(&:text)
    week_rank_titles = document.xpath(token_gen.join('week')).map(&:text)

    day_rank = day_rank_titles.zip(day_rank_hrefs).select { |title, href| href.start_with? '/' }
    day_rank = day_rank.map { |title, href| [title, @domain + href[1..-1]] }
    week_rank = week_rank_titles.zip(week_rank_hrefs).select { |title, href| href.start_with? '/' }
    week_rank = week_rank.map { |title, href| [title, @domain + href[1..-1]] }

    @day_rank_feeds = day_rank.map { |title, href| Feed.new(title, "", "", [], href, "") }
    @week_rank_feeds = week_rank.map { |title, href| Feed.new(title, "", "", [], href, "") }
    nil
  end

  def get_feeds(cat, page_no)
    # TODO: parse all feeds @ page: page_no
    query_url = @domain + "categories/#{cat}/?p=#{page_no}"
    document = Oga.parse_html(open(query_url))
    path = document.xpath(FEED_XPATH).map(&:text)
    # path.each do |feed_id|
    #   feed = _extract_feed(feed_id)
    #   puts "Title: #{feed.title}"
    #   puts "Author: #{feed.author}"
    #   puts "Date: #{feed.date}"
    #   puts "Tags: " + feed.tags.join(", ")
    # end
    path.map { |feed_id| _extract_feed(feed_id) }
  end

  def _extract_feed(feed_id)
    query_url = @domain[0..-2] + "#{feed_id}"
    document = Oga.parse_html(open(query_url))
    title = document.xpath(TITLE_XPATH).text
    author = document.xpath(INFO_XPATH)[0].text.gsub('撰文者:'.force_encoding('ascii-8bit'), '')
    date = document.xpath(INFO_XPATH)[1].text.gsub('發表日期:'.force_encoding('ascii-8bit'), '')
    content = document.xpath(CONTENT_XPATH).text
    tags = document.xpath(TAG_XPATH).map(&:text)
    imgs = document.xpath(IMGS_XPATH).map(&:text)
    Feed.new(title, author, date, tags, query_url, content, imgs)
  end
end
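The class does all main-page work up front: `initialize` downloads the homepage via `Crawler#load_page`, scrapes category links in `analyze`, and populates the two rank lists in `init_rank_feeds`; only `get_feeds` triggers further HTTP requests. A minimal usage sketch (assumes live access to www.bnext.com.tw, or pre-recorded cassettes as in the specs below):

require_relative 'lib/ext_class/bnext_robot'

robot = BNextRobot.new   # fetches and parses http://www.bnext.com.tw/
robot.show_day_rank      # prints "title: link" for each daily-rank feed
robot.show_week_rank     # prints "title: link" for each weekly-rank feed

# Fetch fully populated Feed objects for page 1 of the "tech" category.
feeds = robot.get_feeds('tech', 1)
feeds.each { |feed| puts "#{feed.title} (#{feed.date}) by #{feed.author}" }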
data/lib/int_class/feed.rb
ADDED
@@ -0,0 +1,23 @@


class Feed

  attr_accessor :title, :author, :date, :tags, :link, :content, :imgs

  def initialize(title = '', author = '', date = '', tags = [], link = '', content = '', imgs = [])
    @title = title
    @author = author
    @date = date
    @tags = tags
    @link = link
    @content = content
    @imgs = imgs
  end

  def to_hash
    symbs = %w(title author date tags link content imgs)
    values = [@title, @author, @date, @tags, @link, @content, @imgs]
    Hash[symbs.zip(values)]
  end

end
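One subtlety: `to_hash` builds its keys with `%w(...)`, so despite the variable name `symbs` the keys are strings, not symbols. A quick sketch (values here are illustrative):

feed = Feed.new('Sample title', 'Jane Doe', '2015-10-18', ['tech'], 'http://example.com/1')
feed.to_hash['title']   # => "Sample title"
feed.to_hash[:title]    # => nil (keys are strings, not symbols)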
data/lib/int_class/filter_condition.rb
ADDED
@@ -0,0 +1,41 @@


class FilterCondition

  attr_accessor :conds

  def initialize
    @conds = Hash.new(false)
  end

  def date_must_before(date)
    @conds["date_must_before"] = date
    self
  end

  def date_must_after(date)
    @conds["date_must_after"] = date
    self
  end

  def tags_must_include(tags)
    @conds["tags_must_include"] = tags
    self
  end

  def tags_must_exclude(tags)
    @conds["tags_must_exclude"] = tags
    self
  end

  def title_must_include(terms)
    @conds["title_must_include"] = terms
    self
  end

  def designated_authors(authors)
    @conds["designated_authors"] = authors
    self
  end

end
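Every setter writes to `@conds` under a string key and returns `self`, so conditions chain fluently; unset conditions read back as `false` thanks to the Hash default. A minimal sketch:

cond = FilterCondition.new
       .date_must_after('2015-10-01')
       .tags_must_include(['tech'])

cond.conds['tags_must_include']  # => ["tech"]
cond.conds['date_must_before']   # => false (unset; the Hash default)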
data/lib/int_module/crawl_runner.rb
ADDED
@@ -0,0 +1,26 @@
require 'open-uri'

module Crawler

  attr_accessor :cats, :web_data, :domain

  def load_page(url)
    begin
      @domain = url
      @domain += "/" unless @domain.end_with? "/"
      open(url) { |f| @web_data = f.read }
      1
    rescue
      0
    end
  end

  def analyze
    raise NotImplementedError.new("#{self.class.name}#analyze is an abstract method.")
  end

  def get_feeds(cat, max_num)
    raise NotImplementedError.new("#{self.class.name}#get_feeds is an abstract method.")
  end

end
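`Crawler` acts as a mixin contract: `load_page` is concrete (returning 1 on success and 0 on any failure via the bare rescue), while `analyze` and `get_feeds` raise `NotImplementedError` until the including class overrides them, as `BNextRobot` does. A toy conforming crawler (hypothetical, not part of the gem):

require_relative 'lib/int_module/crawl_runner'

# Counts anchor tags on a page; only needs to override #analyze.
class LinkCounter
  include Crawler

  def analyze
    @web_data.scan(/<a\s/i).size
  end
end

counter = LinkCounter.new
counter.load_page('http://www.bnext.com.tw')  # => 1 on success, 0 on failure
puts counter.analyze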
data/spec/class_spec/bnext_robot_spec.rb
ADDED
@@ -0,0 +1,72 @@
require 'minitest/autorun'
require 'vcr'
require 'webmock/minitest'
require 'yaml'
require_relative '../../lib/ext_class/bnext_robot'

day_rank = [
  "郭台銘投資製造的雲馬X1被爆抄很大,雲造科技回應:並無專利侵權行為: http://www.bnext.com.tw/article/view/id/37685",
  "各位低持股的老闆皮緊一點!狼來了-談矽品案: http://www.bnext.com.tw/article/view/id/37666",
  "Facebook開暗門,透過背景重新整理功能榨乾iPhone電力: http://www.bnext.com.tw/article/view/id/37684",
  "高招!矽品提收購無效之訴,訴訟若拖2~3年不利日月光: http://www.bnext.com.tw/article/view/id/37672",
  "台灣首次大型VR娛樂應用登場!中華職棒總冠軍賽,Lamigo桃猿熱鬧開打!: http://www.bnext.com.tw/article/view/id/37683",
  "Gogoro創辦人陸學森:過去20年來,我學會的10件事。: http://www.bnext.com.tw/article/view/id/37688",
  "被雷軍及郭台銘都看好的智慧電動車雲馬X1!90後創業家玩出新設計: http://www.bnext.com.tw/article/view/id/37663",
  "台積電版 iPhone 秒殺三星版?別急,來看看這個測試: http://www.bnext.com.tw/ext_rss/view/id/1010354"
]

week_rank = [
  "台積電勝三星?iPhone 6s 的 A9處理器事件總整理: http://www.bnext.com.tw/ext_rss/view/id/996449",
  "Excel記帳雲端進化!Google表單比記帳App還好用: http://www.bnext.com.tw/ext_rss/view/id/955360",
  "一台iPhone 6s竟有16種版本?性能有差異,消費者只能認了?: http://www.bnext.com.tw/ext_rss/view/id/1002363",
  "傳華碩不滿?微軟自製筆電買氣旺、OEM廠或遭消滅?: http://www.bnext.com.tw/article/view/id/37652",
  "蘋果穩居冠軍、Facebook強勢增長、Paypal首度進榜!解讀2015全球百大品牌: http://www.bnext.com.tw/article/view/id/37607",
  "圖解行動支付兩大模式,你的錢未來這樣用!: http://www.bnext.com.tw/article/view/id/37609",
  "韓流退燒?LG:韓企全球地位動搖、市佔全面敗退: http://www.bnext.com.tw/article/view/id/37624",
  "消費者眼球都在哪?世界即時通訊及社群媒體使用情形分析: http://www.bnext.com.tw/article/view/id/37667"
]

VCR.configure do |config|
  config.cassette_library_dir = './spec/testfiles/vcr_cassettes'
  config.hook_into :webmock
end

bnext_robot = nil

VCR.use_cassette('bnext_mainpage') do
  bnext_robot = BNextRobot.new

  describe "Get correct day rank articles" do

    it 'has the right number of daily articles' do
      bnext_robot.day_rank_feeds.size.must_equal day_rank.size
    end

    it 'has the right content' do
      content = bnext_robot.day_rank_feeds.map { |feed| "#{feed.title.force_encoding("utf-8")}: #{feed.link.force_encoding("utf-8")}" }
      content.must_equal day_rank
    end
  end

  describe "Get correct week rank articles" do

    it 'has the right number of weekly articles' do
      bnext_robot.week_rank_feeds.size.must_equal week_rank.size
    end

    it 'has the right content' do
      content = bnext_robot.week_rank_feeds.map { |feed| "#{feed.title.force_encoding("utf-8")}: #{feed.link.force_encoding("utf-8")}" }
      content.must_equal week_rank
    end
  end
end

VCR.use_cassette('bnext_techpage') do
  bnext_tech = bnext_robot.get_feeds("tech", 1)
  describe "Get correct list of each category" do

    it 'gets the right number of feeds' do
      bnext_tech.size.must_equal 20
    end
  end
end