blogbot 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/blogbot.gemspec +2 -2
- data/lib/blogbot.rb +4 -15
- data/lib/blogbot/extraction.rb +17 -5
- data/lib/blogbot/navigation.rb +5 -20
- data/lib/blogbot/reflection.rb +1 -1
- data/test/blogbot.rb +59 -0
- data/test/blogbot/extraction.rb +39 -0
- data/test/blogbot/memorization.rb +25 -0
- data/test/blogbot/navigation.rb +62 -0
- data/test/blogbot/reflection.rb +47 -0
- metadata +13 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5ba38caa6667e36e2396162590ecf399cc604a3a
|
4
|
+
data.tar.gz: acd1158109ad0e5c749e2ba8083a3a28f51ba6af
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 41f3cd5d74c3fa5ab90d3bbc4453add4c59216d49985b248701303c123c71b9c12f3e95db9ec8ce18fc06bd2fe3a1dbbd6fea458de8fa46b1024297fd3c6569c
|
7
|
+
data.tar.gz: 1b69e32e940fa81e48f5e13b6339859f1dd0d32f317e5f42f71b94a3465899918ef66bed864bbf84751a06b90806819aa1064eabf885caaa8aa47344a90f05c8
|
data/blogbot.gemspec
CHANGED
data/lib/blogbot.rb
CHANGED
@@ -6,17 +6,6 @@ require 'blogbot/memorization'
|
|
6
6
|
require 'blogbot/navigation'
|
7
7
|
require 'blogbot/reflection'
|
8
8
|
|
9
|
-
# MASTER GAME PLAN:
|
10
|
-
#
|
11
|
-
# 1) accept target site
|
12
|
-
# 2) scan for keyword popular
|
13
|
-
# a- if on homepage and nothing comes up, find blogpage then scan again
|
14
|
-
# b- if nothing is coming up, scan sitemap.xml, etc for keyword popular
|
15
|
-
# 3) determine div id/class of where keyword popular is located
|
16
|
-
# 4) extract all hyperlinks and store in hash
|
17
|
-
# 5) index??
|
18
|
-
#
|
19
|
-
|
20
9
|
# Make a new blogbot that can scan pages and extract the most popular links!
|
21
10
|
class Blogbot
|
22
11
|
include Extraction
|
@@ -45,7 +34,7 @@ class Blogbot
|
|
45
34
|
set_agent
|
46
35
|
@current_page = nil
|
47
36
|
@current_element = nil
|
48
|
-
@popular_links =
|
37
|
+
@popular_links = []
|
49
38
|
@indicator = nil
|
50
39
|
end
|
51
40
|
|
@@ -59,12 +48,12 @@ class Blogbot
|
|
59
48
|
set_agent
|
60
49
|
@current_page = nil
|
61
50
|
@current_element = nil
|
62
|
-
@popular_links =
|
51
|
+
@popular_links = []
|
63
52
|
@indicator = nil
|
64
53
|
end
|
65
54
|
|
66
55
|
def ignorance_error
|
67
|
-
|
68
|
-
or this bot isn't smart enough to extract this site yet
|
56
|
+
fail "Sorry, either there are no popular links present
|
57
|
+
or this bot isn't smart enough to extract this site yet\n"
|
69
58
|
end
|
70
59
|
end
|
data/lib/blogbot/extraction.rb
CHANGED
@@ -8,12 +8,24 @@ module Extraction
|
|
8
8
|
puts 'Not enough links to extract' if see_multiple_links? == false
|
9
9
|
|
10
10
|
@current_element.css('a').each do |a|
|
11
|
-
next if a.text == '' || a['href'] == '#'
|
11
|
+
next if a.text == '' || a['href'] == '#'
|
12
12
|
title = a.text
|
13
13
|
link = a['href']
|
14
|
-
|
14
|
+
hash = {title: title, link: link}
|
15
|
+
@popular_links << hash
|
15
16
|
end
|
16
|
-
|
17
|
+
end
|
18
|
+
|
19
|
+
def display_links
|
20
|
+
puts "-"*50
|
21
|
+
@popular_links.each do |hash|
|
22
|
+
hash.each do |k, v|
|
23
|
+
puts "#{k.upcase}: #{v}"
|
24
|
+
end
|
25
|
+
puts
|
26
|
+
end
|
27
|
+
puts "-"*50
|
28
|
+
@popular_links
|
17
29
|
end
|
18
30
|
|
19
31
|
def extract(url)
|
@@ -21,7 +33,7 @@ module Extraction
|
|
21
33
|
puts "\nExtracting ...\n"
|
22
34
|
scan url
|
23
35
|
locate_popular_links
|
24
|
-
extract_links
|
25
|
-
@popular_links.nil? == true ? simple_error :
|
36
|
+
extract_links
|
37
|
+
@popular_links.nil? == true ? simple_error : display_links
|
26
38
|
end
|
27
39
|
end
|
data/lib/blogbot/navigation.rb
CHANGED
@@ -1,14 +1,12 @@
|
|
1
1
|
# Adds capability to navigate through posts pages.
|
2
2
|
module Navigation
|
3
|
-
# TODO: add method to auto navigate parents until proper links are present
|
4
|
-
|
5
3
|
# Navigates to posts page based off of store_posts_url.
|
6
4
|
def go_to_posts_page
|
7
5
|
find_posts_url
|
8
6
|
store_posts_url
|
9
7
|
@current_page = @posts_url.click
|
10
8
|
end
|
11
|
-
|
9
|
+
|
12
10
|
# Navigates to previous Mechanize page.
|
13
11
|
def previous_page
|
14
12
|
@current_page = @agent.get(@agent.back['href'])
|
@@ -21,6 +19,7 @@ module Navigation
|
|
21
19
|
else
|
22
20
|
store_indicator
|
23
21
|
@current_element = @indicator
|
22
|
+
@current_element
|
24
23
|
end
|
25
24
|
end
|
26
25
|
|
@@ -38,25 +37,11 @@ module Navigation
|
|
38
37
|
end
|
39
38
|
end
|
40
39
|
|
41
|
-
####
|
42
|
-
# FIXME: this selects the first child and can lead
|
43
|
-
# bot into rabbit hole. Only use when at lower levels of html.
|
44
|
-
# Need to change this to an iteration in the future.
|
45
|
-
#
|
46
|
-
# Changes current Nokogiri element to first child.
|
47
|
-
def descend
|
48
|
-
if @current_element.children.empty? == true # no more room to descend
|
49
|
-
puts 'At lowest element. Nothing left to descend.'
|
50
|
-
else
|
51
|
-
@current_element = @current_element.child
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
40
|
def auto_ascend
|
56
41
|
ascend until see_multiple_links? == true
|
57
42
|
@current_element
|
58
43
|
end
|
59
|
-
|
44
|
+
|
60
45
|
def locate_popular_links
|
61
46
|
if possible_success? == false
|
62
47
|
ignorance_error
|
@@ -67,8 +52,8 @@ module Navigation
|
|
67
52
|
see_popular? == true ? crawl_popular : ignorance_error
|
68
53
|
end
|
69
54
|
end
|
70
|
-
|
71
|
-
# Examine popular element and climb DOM tree until multiple
|
55
|
+
|
56
|
+
# Examine popular element and climb DOM tree until multiple
|
72
57
|
# links are present.
|
73
58
|
def crawl_popular
|
74
59
|
go_to_popular
|
data/lib/blogbot/reflection.rb
CHANGED
data/test/blogbot.rb
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'mechanize'
|
4
|
+
require 'blogbot/extraction'
|
5
|
+
require 'blogbot/memorization'
|
6
|
+
require 'blogbot/navigation'
|
7
|
+
require 'blogbot/reflection'
|
8
|
+
|
9
|
+
# Make a new blogbot that can scan pages and extract the most popular links!
|
10
|
+
class Blogbot
|
11
|
+
include Extraction
|
12
|
+
include Memorization
|
13
|
+
include Navigation
|
14
|
+
include Reflection
|
15
|
+
|
16
|
+
attr_accessor(
|
17
|
+
:agent,
|
18
|
+
:article_link,
|
19
|
+
:blog_link,
|
20
|
+
:current_element,
|
21
|
+
:current_page,
|
22
|
+
:indicator,
|
23
|
+
:parent,
|
24
|
+
:posts_url,
|
25
|
+
:target_url
|
26
|
+
)
|
27
|
+
|
28
|
+
def set_agent
|
29
|
+
self.agent = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }
|
30
|
+
end
|
31
|
+
|
32
|
+
def initialize
|
33
|
+
puts 'Powering up the rubatron generators!!'
|
34
|
+
set_agent
|
35
|
+
@current_page = nil
|
36
|
+
@current_element = nil
|
37
|
+
@popular_links = []
|
38
|
+
@indicator = nil
|
39
|
+
end
|
40
|
+
|
41
|
+
# GET a page using Mechanize and set to current page.
|
42
|
+
def scan(url)
|
43
|
+
@target_url = url
|
44
|
+
@current_page = @agent.get(@target_url)
|
45
|
+
end
|
46
|
+
|
47
|
+
def reset
|
48
|
+
set_agent
|
49
|
+
@current_page = nil
|
50
|
+
@current_element = nil
|
51
|
+
@popular_links = []
|
52
|
+
@indicator = nil
|
53
|
+
end
|
54
|
+
|
55
|
+
def ignorance_error
|
56
|
+
fail "Sorry, either there are no popular links present
|
57
|
+
or this bot isn't smart enough to extract this site yet\n"
|
58
|
+
end
|
59
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# Adds capability to extract data in an organized format from webpage.
|
2
|
+
module Extraction
|
3
|
+
####
|
4
|
+
# Extracts titles and hyperlinks from element being examined.
|
5
|
+
# If the text is an empty '' it's an <img>.
|
6
|
+
# Images are typically duplicate links and ok to skip.
|
7
|
+
def extract_links
|
8
|
+
puts 'Not enough links to extract' if see_multiple_links? == false
|
9
|
+
|
10
|
+
@current_element.css('a').each do |a|
|
11
|
+
next if a.text == '' || a['href'] == '#'
|
12
|
+
title = a.text
|
13
|
+
link = a['href']
|
14
|
+
hash = {title: title, link: link}
|
15
|
+
@popular_links << hash
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def display_links
|
20
|
+
puts "-"*50
|
21
|
+
@popular_links.each do |hash|
|
22
|
+
hash.each do |k, v|
|
23
|
+
puts "#{k.upcase}: #{v}"
|
24
|
+
end
|
25
|
+
puts
|
26
|
+
end
|
27
|
+
puts "-"*50
|
28
|
+
@popular_links
|
29
|
+
end
|
30
|
+
|
31
|
+
def extract(url)
|
32
|
+
reset
|
33
|
+
puts "\nExtracting ...\n"
|
34
|
+
scan url
|
35
|
+
locate_popular_links
|
36
|
+
extract_links
|
37
|
+
@popular_links.nil? == true ? simple_error : display_links
|
38
|
+
end
|
39
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# Adds capability to memorize things such as URLs, etc.
|
2
|
+
# Most variables are stored here.
|
3
|
+
module Memorization
|
4
|
+
# Searches page for link that says Articles or Blog.
|
5
|
+
def find_posts_url
|
6
|
+
@article_link = @current_page.link_with(text: /Articles/)
|
7
|
+
@blog_link = @current_page.link_with(text: /Blog/)
|
8
|
+
end
|
9
|
+
|
10
|
+
# Memorize posts_url found by find_posts_url.
|
11
|
+
def store_posts_url
|
12
|
+
@posts_url =
|
13
|
+
case
|
14
|
+
when @article_link.nil? == false
|
15
|
+
@article_link
|
16
|
+
when @blog_link.nil? == false
|
17
|
+
@blog_link
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
# Sets search indicator to whatever had 'Popular' in its text.
|
22
|
+
def store_indicator
|
23
|
+
@indicator = @current_page.search("[text()*='Popular']").first
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
# Adds capability to navigate through posts pages.
|
2
|
+
module Navigation
|
3
|
+
# Navigates to posts page based off of store_posts_url.
|
4
|
+
def go_to_posts_page
|
5
|
+
find_posts_url
|
6
|
+
store_posts_url
|
7
|
+
@current_page = @posts_url.click
|
8
|
+
end
|
9
|
+
|
10
|
+
# Navigates to previous Mechanize page.
|
11
|
+
def previous_page
|
12
|
+
@current_page = @agent.get(@agent.back['href'])
|
13
|
+
end
|
14
|
+
|
15
|
+
# Sets current element to 'Popular' indicator.
|
16
|
+
def go_to_popular
|
17
|
+
if see_popular? == false
|
18
|
+
puts 'Nothing says "Popular" on this page'
|
19
|
+
else
|
20
|
+
store_indicator
|
21
|
+
@current_element = @indicator
|
22
|
+
@current_element
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
# Returns parent of current Nokogiri element
|
27
|
+
def find_parent
|
28
|
+
@parent = @current_element.parent # one element higher, the div container
|
29
|
+
end
|
30
|
+
|
31
|
+
# Changes current Nokogiri element to its parent.
|
32
|
+
def ascend
|
33
|
+
if @current_element.ancestors.empty? == true # no more room to ascend
|
34
|
+
puts 'At highest element. Nothing left to ascend.'
|
35
|
+
else
|
36
|
+
@current_element = @current_element.parent
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
def auto_ascend
|
41
|
+
ascend until see_multiple_links? == true
|
42
|
+
@current_element
|
43
|
+
end
|
44
|
+
|
45
|
+
def locate_popular_links
|
46
|
+
if possible_success? == false
|
47
|
+
ignorance_error
|
48
|
+
elsif see_popular? == true
|
49
|
+
crawl_popular
|
50
|
+
elsif see_posts? == true
|
51
|
+
go_to_posts_page
|
52
|
+
see_popular? == true ? crawl_popular : ignorance_error
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
# Examine popular element and climb DOM tree until multiple
|
57
|
+
# links are present.
|
58
|
+
def crawl_popular
|
59
|
+
go_to_popular
|
60
|
+
auto_ascend
|
61
|
+
end
|
62
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# Adds capability to examine for posts, articles, and links.
|
2
|
+
module Reflection
|
3
|
+
# Searches for link that says articles or blog.
|
4
|
+
def see_posts?
|
5
|
+
find_posts_url
|
6
|
+
if @article_link.nil? == true && @blog_link.nil? == true
|
7
|
+
false # => no article links found
|
8
|
+
else
|
9
|
+
true
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
# Searches for keyword 'Popular' on current page.
|
14
|
+
def see_popular?
|
15
|
+
search = @current_page.search "[text()*='Popular']"
|
16
|
+
search.empty? ? false : true
|
17
|
+
end
|
18
|
+
|
19
|
+
# Searches for presence of <a> tags in current element.
|
20
|
+
def see_links?
|
21
|
+
@current_element.css('a').empty? ? false : true
|
22
|
+
end
|
23
|
+
|
24
|
+
# Searches for the presence of more than two <a> tags in current element.
|
25
|
+
def see_multiple_links?
|
26
|
+
@current_element.css('a').length < 3 ? false : true
|
27
|
+
end
|
28
|
+
|
29
|
+
####
|
30
|
+
# Determines if bot can crawl a Popular section.
|
31
|
+
# If it doesn't see popular at first glance, it searches and navigates
|
32
|
+
# to Posts page (Articles or Blog).
|
33
|
+
#
|
34
|
+
# If Popular section still doesn't exist, it's a no-go.
|
35
|
+
def possible_success?
|
36
|
+
if see_popular? == true
|
37
|
+
true
|
38
|
+
elsif see_posts? == true
|
39
|
+
go_to_posts_page # Changes page to look for popular.
|
40
|
+
answer = see_popular? == true ? true : false # true if popular is present
|
41
|
+
previous_page # Check is complete. Return to original page.
|
42
|
+
answer
|
43
|
+
else
|
44
|
+
false
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: blogbot
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Mason
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-09-
|
11
|
+
date: 2015-09-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -37,6 +37,11 @@ files:
|
|
37
37
|
- lib/blogbot/memorization.rb
|
38
38
|
- lib/blogbot/navigation.rb
|
39
39
|
- lib/blogbot/reflection.rb
|
40
|
+
- test/blogbot.rb
|
41
|
+
- test/blogbot/extraction.rb
|
42
|
+
- test/blogbot/memorization.rb
|
43
|
+
- test/blogbot/navigation.rb
|
44
|
+
- test/blogbot/reflection.rb
|
40
45
|
homepage: https://github.com/m8ss/blogbot
|
41
46
|
licenses:
|
42
47
|
- MIT
|
@@ -61,4 +66,9 @@ rubygems_version: 2.4.6
|
|
61
66
|
signing_key:
|
62
67
|
specification_version: 4
|
63
68
|
summary: The internet is full of noise. Only read the best.
|
64
|
-
test_files:
|
69
|
+
test_files:
|
70
|
+
- test/blogbot.rb
|
71
|
+
- test/blogbot/extraction.rb
|
72
|
+
- test/blogbot/memorization.rb
|
73
|
+
- test/blogbot/navigation.rb
|
74
|
+
- test/blogbot/reflection.rb
|