bnext_robot 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9c225a805567ccb4cb9db98087c3b6c09bd83f12
4
+ data.tar.gz: 52872627905196a8ea0b3d8683881c783f4a35e8
5
+ SHA512:
6
+ metadata.gz: dcfe0e44c660d74c4213a5f9826ecd2cd02a5cc03509c966419a400123d3f7b8d09167badb70c9b571842b9bb92d8980d9fd64cd4dc305ccfb54371b2a996723
7
+ data.tar.gz: 634dd5d0f6d7e594e670e3b7c1295c2371e0539a95392a7ab091a0bfddf2546f3d24dc5ddd69e6cd0ee8a0b3ec1b79c6c1fbf4584396867eeb915f76fa4726dd
data/Gemfile ADDED
@@ -0,0 +1,9 @@
1
+ # A sample Gemfile
2
+ source 'https://rubygems.org'
3
+
4
+ # gem "rails"
5
+ gem 'oga'
6
+ gem 'xpath'
7
+ gem 'vcr'
8
+ gem 'minitest'
9
+ gem 'webmock'
data/Gemfile.lock ADDED
@@ -0,0 +1,40 @@
1
+ GEM
2
+ remote: https://rubygems.org/
3
+ specs:
4
+ addressable (2.3.8)
5
+ ansi (1.5.0)
6
+ ast (2.1.0)
7
+ crack (0.4.2)
8
+ safe_yaml (~> 1.0.0)
9
+ hashdiff (0.2.2)
10
+ mini_portile (0.6.2)
11
+ minitest (5.8.1)
12
+ nokogiri (1.6.6.2)
13
+ mini_portile (~> 0.6.0)
14
+ oga (1.3.1)
15
+ ast
16
+ ruby-ll (~> 2.1)
17
+ ruby-ll (2.1.2)
18
+ ansi
19
+ ast
20
+ safe_yaml (1.0.4)
21
+ vcr (2.9.3)
22
+ webmock (1.22.1)
23
+ addressable (>= 2.3.6)
24
+ crack (>= 0.3.2)
25
+ hashdiff
26
+ xpath (2.0.0)
27
+ nokogiri (~> 1.3)
28
+
29
+ PLATFORMS
30
+ ruby
31
+
32
+ DEPENDENCIES
33
+ minitest
34
+ oga
35
+ vcr
36
+ webmock
37
+ xpath
38
+
39
+ BUNDLED WITH
40
+ 1.10.6
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
require 'rake/testtask'

# Running bare `rake` runs the spec suite.
task default: [:spec]

desc 'Run specs'
# Fix: the original passed `name=:spec`, which merely assigns a throwaway
# local variable before passing it; the idiom is to pass the symbol directly.
Rake::TestTask.new(:spec) do |t|
  t.pattern = 'spec/class_spec/*_spec.rb'
end
data/bin/bnext_robot ADDED
@@ -0,0 +1,15 @@
1
#!/usr/bin/env ruby
# CLI entry point: prints Business Next's daily or weekly article ranking.
require_relative '../lib/ext_class/bnext_robot'

# Validate arguments *before* building the robot (construction downloads the
# front page). `abort` prints the usage line without a backtrace, unlike the
# original `fail ArgumentError`.
abort 'Usage: bnext_robot [week/day]' if ARGV.empty?

case ARGV[0]
when 'week'
  BNextRobot.new.show_week_rank
when 'day'
  BNextRobot.new.show_day_rank
else
  puts "Please type week or day"
end
@@ -0,0 +1,22 @@
1
$LOAD_PATH.push File.expand_path('../robot', __FILE__)

# Gem packaging metadata for the bnext_robot Business Next scraper.
Gem::Specification.new do |s|
  s.name        = 'bnext_robot'
  s.version     = '0.1.0'
  s.date        = '2015-10-18'
  s.executables << 'bnext_robot'
  # Fix: "scrapy" (the Python framework's name) -> "scraper".
  s.summary     = 'Web scraper for Business Next'
  s.description = 'Web scraper for Business Next, including showing day/week rank and feeds extraction'
  s.authors     = ['Jacky Pan', 'Angela Hung', 'Edison Lee', 'Tony Lee']
  s.email       = ['jackypan000@gmail.com', 'angela.hung@iss.nthu.edu.tw', 'dfg1021@hotmail.com.tw', 'tony123930@yahoo.com.tw']
  # Package every git-tracked file; the spec/ subset doubles as test_files.
  s.files       = `git ls-files`.split("\n")
  s.test_files  = `git ls-files spec/*`.split("\n")
  s.homepage    = 'https://github.com/SOA-Upstart4/Team-HW-1-Ideate-and-Scrape'
  s.license     = 'MIT'

  s.add_development_dependency 'minitest'
  s.add_development_dependency 'minitest-rg'
  s.add_development_dependency 'vcr'
  s.add_development_dependency 'webmock'
  s.add_runtime_dependency 'oga'
end
@@ -0,0 +1,96 @@
1
require_relative '../int_module/crawl_runner'
require_relative '../int_module/feeds_filter'
require_relative '../int_class/feed'
require_relative '../int_class/filter_condition'
require 'oga'
require 'open-uri'

# BNextRobot scrapes www.bnext.com.tw: it collects the daily/weekly hot-feed
# rankings from the front page and can extract full articles per category.
class BNextRobot
  include Crawler
  include FeedFilter

  # XPath selectors for the pieces of an article page.
  FEED_XPATH = "//a[contains(@class, 'item_title block_link')]/@href"
  TITLE_XPATH = "//div[contains(@class, 'main_title')]"
  TAG_XPATH = "//a[contains(@class, 'tag_link')]"
  INFO_XPATH = "//span[contains(@class, 'info')]"
  CONTENT_XPATH = "//div[contains(@class, 'content htmlview')]"
  IMGS_XPATH = "//div[contains(@class, 'content htmlview')]/p/img/@src"

  attr_accessor :day_rank_feeds, :week_rank_feeds

  # Eagerly downloads the front page (network I/O) and parses the category
  # links and both rank lists.
  def initialize
    load_page('http://www.bnext.com.tw/')
    analyze
    init_rank_feeds
  end

  # Scans the raw front-page HTML for <li><a href="/categories/...">name</a>
  # entries and fills @cats with {category name => absolute URL}.
  # Returns nil; the result lives in @cats (a default-false Hash).
  def analyze
    list_items = @web_data.scan(%r{<li>.*?</li>})
    anchors = list_items.map { |li| li.match(%r{<a.*?</a>}).to_s }
    # [7..-2] drops the closing quote AND the href's leading '/', so the
    # remainder ("categories/...") appends cleanly to slash-terminated @domain.
    hrefs = anchors.map { |a| a.match(/href=".*?"/).to_s[7..-2] }
    names = anchors.map { |a| a.match(/>.+?</).to_s[1..-2] }
    pairs = names.zip(hrefs).select { |_name, ref| ref.start_with?('categories') }

    @cats = Hash.new(false)
    # `each`, not `map`: the original used map purely for side effects.
    pairs.each { |name, ref| @cats[name] = @domain + ref }
    nil
  end

  # Prints "title: link" for each daily-ranked feed. Returns nil.
  def show_day_rank
    @day_rank_feeds.each { |feed| puts "#{feed.title}: #{feed.link}" }
    nil
  end

  # Prints "title: link" for each weekly-ranked feed. Returns nil.
  def show_week_rank
    @week_rank_feeds.each { |feed| puts "#{feed.title}: #{feed.link}" }
    nil
  end

  # Builds @day_rank_feeds / @week_rank_feeds from the "day_rank"/"week_rank"
  # <div> sections of the already-downloaded front page. Returns nil.
  def init_rank_feeds
    token_gen = ["//div[@id = '", "_rank']//a[@class = 'content']"]
    document = Oga.parse_html(@web_data)

    day_hrefs = document.xpath(token_gen.join('day') + '/@href').map(&:text)
    week_hrefs = document.xpath(token_gen.join('week') + '/@href').map(&:text)
    day_titles = document.xpath(token_gen.join('day')).map(&:text)
    week_titles = document.xpath(token_gen.join('week')).map(&:text)

    # Keep only site-relative links; href[1..-1] drops the leading '/'
    # because @domain already ends with one.
    day_rank = day_titles.zip(day_hrefs)
                         .select { |_title, href| href.start_with?('/') }
                         .map { |title, href| [title, @domain + href[1..-1]] }
    week_rank = week_titles.zip(week_hrefs)
                           .select { |_title, href| href.start_with?('/') }
                           .map { |title, href| [title, @domain + href[1..-1]] }

    @day_rank_feeds = day_rank.map { |title, href| Feed.new(title, '', '', [], href, '') }
    @week_rank_feeds = week_rank.map { |title, href| Feed.new(title, '', '', [], href, '') }
    nil
  end

  # Returns an array of fully-populated Feed objects for page +page_no+ of
  # category +cat+ (one extra HTTP request per article).
  def get_feeds(cat, page_no)
    query_url = @domain + "categories/#{cat}/?p=#{page_no}"
    # URI.open instead of Kernel#open: Kernel#open would spawn a subprocess
    # for strings beginning with '|'.
    document = Oga.parse_html(URI.open(query_url))
    feed_ids = document.xpath(FEED_XPATH).map(&:text)
    feed_ids.map { |feed_id| _extract_feed(feed_id) }
  end

  # Downloads one article (+feed_id+ is a site-absolute path beginning with
  # '/') and extracts its fields into a Feed.
  def _extract_feed(feed_id)
    query_url = @domain[0..-2] + feed_id.to_s # trim '/'; feed_id has its own
    document = Oga.parse_html(URI.open(query_url))
    title = document.xpath(TITLE_XPATH).text
    # The info spans read "撰文者:<author>" / "發表日期:<date>"; strip the labels.
    # Fix: the original's force_encoding('ascii-8bit') on these UTF-8 literals
    # made gsub incompatible with the page's UTF-8 text.
    author = document.xpath(INFO_XPATH)[0].text.gsub('撰文者:', '')
    date = document.xpath(INFO_XPATH)[1].text.gsub('發表日期:', '')
    content = document.xpath(CONTENT_XPATH).text
    tags = document.xpath(TAG_XPATH).map(&:text)
    imgs = document.xpath(IMGS_XPATH).map(&:text)
    Feed.new(title, author, date, tags, query_url, content, imgs)
  end
end
@@ -0,0 +1,23 @@
1
+
2
+
3
# Feed is a plain value object holding one scraped article's fields.
class Feed
  attr_accessor :title, :author, :date, :tags, :link, :content, :imgs

  # Every field is optional; scalar fields default to '' and collection
  # fields to a fresh empty array.
  def initialize(title = '', author = '', date = '', tags = [], link = '', content = '', imgs = [])
    @title, @author, @date = title, author, date
    @tags, @link = tags, link
    @content, @imgs = content, imgs
  end

  # Returns a Hash keyed by attribute *names as strings* (matching the
  # original zip-based implementation) mapping to the current values.
  def to_hash
    {
      'title' => @title,
      'author' => @author,
      'date' => @date,
      'tags' => @tags,
      'link' => @link,
      'content' => @content,
      'imgs' => @imgs
    }
  end
end
@@ -0,0 +1,41 @@
1
+
2
+
3
# FilterCondition accumulates feed-filtering criteria in @conds.
# Every setter stores its value under a string key and returns self so
# conditions can be chained fluently.
class FilterCondition
  attr_accessor :conds

  def initialize
    # Unset conditions read back as false.
    @conds = Hash.new(false)
  end

  # The six condition setters are identical except for their key, so they
  # are generated rather than written out by hand.
  %w(date_must_before date_must_after tags_must_include
     tags_must_exclude title_must_include designated_authors).each do |cond|
    define_method(cond) do |value|
      @conds[cond] = value
      self
    end
  end
end
@@ -0,0 +1,26 @@
1
require 'open-uri'

# Crawler mixin: downloads a page into @web_data and declares the abstract
# scraping interface that including classes must implement.
module Crawler
  attr_accessor :cats, :web_data, :domain

  # Downloads +url+ into @web_data and remembers the slash-terminated domain.
  # Returns 1 on success and 0 on any fetch error — the numeric contract and
  # the deliberate best-effort error swallowing are kept from the original.
  def load_page(url)
    @domain = url
    @domain += '/' unless @domain.end_with?('/')
    # URI.open instead of Kernel#open: Kernel#open would spawn a subprocess
    # for strings beginning with '|'.
    URI.open(url) { |f| @web_data = f.read }
    1
  rescue StandardError # narrowed from a bare rescue; same classes caught
    0
  end

  # Abstract: parse @web_data into whatever the robot needs.
  def analyze
    raise NotImplementedError, "#{self.class.name}#analyze is an abstract method."
  end

  # Abstract: return feeds for +cat+, bounded by +max_num+.
  def get_feeds(cat, max_num)
    raise NotImplementedError, "#{self.class.name}#get_feeds is an abstract method."
  end
end
@@ -0,0 +1,11 @@
1
require_relative '../int_class/feed'
require_relative '../int_class/filter_condition'

# FeedFilter: mixin intended to hold feed-filtering logic driven by a
# FilterCondition's accumulated criteria.
module FeedFilter

  # Placeholder: filtering is not implemented yet, so +feeds+ is returned
  # unchanged and +condition+ (a FilterCondition) is ignored.
  def filter_feeds( feeds, condition )
    # Implement filtering
    feeds
  end

end
@@ -0,0 +1,72 @@
1
require 'minitest/autorun'
require 'vcr'
require 'webmock/minitest'
require 'yaml'
require_relative '../../lib/ext_class/bnext_robot'

# Expected "title: link" lines as recorded in the VCR cassettes below.
day_rank = [
  "郭台銘投資製造的雲馬X1被爆抄很大,雲造科技回應:並無專利侵權行為: http://www.bnext.com.tw/article/view/id/37685",
  "各位低持股的老闆皮緊一點!狼來了-談矽品案: http://www.bnext.com.tw/article/view/id/37666",
  "Facebook開暗門,透過背景重新整理功能榨乾iPhone電力: http://www.bnext.com.tw/article/view/id/37684",
  "高招!矽品提收購無效之訴,訴訟若拖2~3年不利日月光: http://www.bnext.com.tw/article/view/id/37672",
  "台灣首次大型VR娛樂應用登場!中華職棒總冠軍賽,Lamigo桃猿熱鬧開打!: http://www.bnext.com.tw/article/view/id/37683",
  "Gogoro創辦人陸學森:過去20年來,我學會的10件事。: http://www.bnext.com.tw/article/view/id/37688",
  "被雷軍及郭台銘都看好的智慧電動車雲馬X1!90後創業家玩出新設計: http://www.bnext.com.tw/article/view/id/37663",
  "台積電版 iPhone 秒殺三星版?別急,來看看這個測試: http://www.bnext.com.tw/ext_rss/view/id/1010354"
]

week_rank = [
  "台積電勝三星?iPhone 6s 的 A9處理器事件總整理: http://www.bnext.com.tw/ext_rss/view/id/996449",
  "Excel記帳雲端進化!Google表單比記帳App還好用: http://www.bnext.com.tw/ext_rss/view/id/955360",
  "一台iPhone 6s竟有16種版本?性能有差異,消費者只能認了?: http://www.bnext.com.tw/ext_rss/view/id/1002363",
  "傳華碩不滿?微軟自製筆電買氣旺、OEM廠或遭消滅?: http://www.bnext.com.tw/article/view/id/37652",
  "蘋果穩居冠軍、Facebook強勢增長、Paypal首度進榜!解讀2015全球百大品牌: http://www.bnext.com.tw/article/view/id/37607",
  "圖解行動支付兩大模式,你的錢未來這樣用!: http://www.bnext.com.tw/article/view/id/37609",
  "韓流退燒?LG:韓企全球地位動搖、市佔全面敗退: http://www.bnext.com.tw/article/view/id/37624",
  "消費者眼球都在哪?世界即時通訊及社群媒體使用情形分析: http://www.bnext.com.tw/article/view/id/37667"
]

# Replay canned HTTP responses so the specs never hit the live site.
VCR.configure do |config|
  config.cassette_library_dir = './spec/testfiles/vcr_cassettes'
  config.hook_into :webmock
end

bnext_robot = nil

VCR.use_cassette('bnext_mainpage') do
  bnext_robot = BNextRobot.new

  describe "Get correct day rank articles" do

    it 'has the right number of daily articles' do
      bnext_robot.day_rank_feeds.size.must_equal day_rank.size
    end

    it 'has the right content' do
      content = bnext_robot.day_rank_feeds.map { |feed| "#{feed.title.force_encoding("utf-8")}: #{feed.link.force_encoding("utf-8")}" }
      content.must_equal day_rank
    end
  end

  describe "Get correct week rank articles" do

    # Fix: this example's description previously said "daily" (copy-paste)
    # while asserting on the weekly list.
    it 'has the right number of weekly articles' do
      bnext_robot.week_rank_feeds.size.must_equal week_rank.size
    end

    it 'has the right content' do
      content = bnext_robot.week_rank_feeds.map { |feed| "#{feed.title.force_encoding("utf-8")}: #{feed.link.force_encoding("utf-8")}" }
      content.must_equal week_rank
    end
  end
end

VCR.use_cassette('bnext_techpage') do
  bnext_tech = bnext_robot.get_feeds("tech", 1)
  describe "Get correct list of each category" do

    it 'get right number of feeds' do
      bnext_tech.size.must_equal 20
    end
  end
end