blogbot 0.0.4 → 0.0.5
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/blogbot.gemspec +2 -2
- data/lib/blogbot.rb +4 -15
- data/lib/blogbot/extraction.rb +17 -5
- data/lib/blogbot/navigation.rb +5 -20
- data/lib/blogbot/reflection.rb +1 -1
- data/test/blogbot.rb +59 -0
- data/test/blogbot/extraction.rb +39 -0
- data/test/blogbot/memorization.rb +25 -0
- data/test/blogbot/navigation.rb +62 -0
- data/test/blogbot/reflection.rb +47 -0
- metadata +13 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5ba38caa6667e36e2396162590ecf399cc604a3a
|
4
|
+
data.tar.gz: acd1158109ad0e5c749e2ba8083a3a28f51ba6af
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 41f3cd5d74c3fa5ab90d3bbc4453add4c59216d49985b248701303c123c71b9c12f3e95db9ec8ce18fc06bd2fe3a1dbbd6fea458de8fa46b1024297fd3c6569c
|
7
|
+
data.tar.gz: 1b69e32e940fa81e48f5e13b6339859f1dd0d32f317e5f42f71b94a3465899918ef66bed864bbf84751a06b90806819aa1064eabf885caaa8aa47344a90f05c8
|
data/blogbot.gemspec
CHANGED
data/lib/blogbot.rb
CHANGED
@@ -6,17 +6,6 @@ require 'blogbot/memorization'
|
|
6
6
|
require 'blogbot/navigation'
|
7
7
|
require 'blogbot/reflection'
|
8
8
|
|
9
|
-
# MASTER GAME PLAN:
|
10
|
-
#
|
11
|
-
# 1) accept target site
|
12
|
-
# 2) scan for keyword popular
|
13
|
-
# a- if on homepage and nothing comes up, find blogpage then scan again
|
14
|
-
# b- if nothing is coming up, scan sitemap.xml, etc for keyword popular
|
15
|
-
# 3) determine div id/class of where keyword popular is located
|
16
|
-
# 4) extract all hyperlinks and store in hash
|
17
|
-
# 5) index??
|
18
|
-
#
|
19
|
-
|
20
9
|
# Make a new blogbot that can scan pages and extract the most popular links!
|
21
10
|
class Blogbot
|
22
11
|
include Extraction
|
@@ -45,7 +34,7 @@ class Blogbot
|
|
45
34
|
set_agent
|
46
35
|
@current_page = nil
|
47
36
|
@current_element = nil
|
48
|
-
@popular_links =
|
37
|
+
@popular_links = []
|
49
38
|
@indicator = nil
|
50
39
|
end
|
51
40
|
|
@@ -59,12 +48,12 @@ class Blogbot
|
|
59
48
|
set_agent
|
60
49
|
@current_page = nil
|
61
50
|
@current_element = nil
|
62
|
-
@popular_links =
|
51
|
+
@popular_links = []
|
63
52
|
@indicator = nil
|
64
53
|
end
|
65
54
|
|
66
55
|
def ignorance_error
|
67
|
-
|
68
|
-
or this bot isn't smart enough to extract this site yet
|
56
|
+
fail "Sorry, either there are no popular links present
|
57
|
+
or this bot isn't smart enough to extract this site yet\n"
|
69
58
|
end
|
70
59
|
end
|
data/lib/blogbot/extraction.rb
CHANGED
@@ -8,12 +8,24 @@ module Extraction
|
|
8
8
|
puts 'Not enough links to extract' if see_multiple_links? == false
|
9
9
|
|
10
10
|
@current_element.css('a').each do |a|
|
11
|
-
next if a.text == '' || a['href'] == '#'
|
11
|
+
next if a.text == '' || a['href'] == '#'
|
12
12
|
title = a.text
|
13
13
|
link = a['href']
|
14
|
-
|
14
|
+
hash = {title: title, link: link}
|
15
|
+
@popular_links << hash
|
15
16
|
end
|
16
|
-
|
17
|
+
end
|
18
|
+
|
19
|
+
def display_links
|
20
|
+
puts "-"*50
|
21
|
+
@popular_links.each do |hash|
|
22
|
+
hash.each do |k, v|
|
23
|
+
puts "#{k.upcase}: #{v}"
|
24
|
+
end
|
25
|
+
puts
|
26
|
+
end
|
27
|
+
puts "-"*50
|
28
|
+
@popular_links
|
17
29
|
end
|
18
30
|
|
19
31
|
def extract(url)
|
@@ -21,7 +33,7 @@ module Extraction
|
|
21
33
|
puts "\nExtracting ...\n"
|
22
34
|
scan url
|
23
35
|
locate_popular_links
|
24
|
-
extract_links
|
25
|
-
@popular_links.nil? == true ? simple_error :
|
36
|
+
extract_links
|
37
|
+
@popular_links.nil? == true ? simple_error : display_links
|
26
38
|
end
|
27
39
|
end
|
data/lib/blogbot/navigation.rb
CHANGED
@@ -1,14 +1,12 @@
|
|
1
1
|
# Adds capability to navigate through posts pages.
|
2
2
|
module Navigation
|
3
|
-
# TODO: add method to auto navigate parents until proper links are present
|
4
|
-
|
5
3
|
# Navigates to posts page based off of store_posts_url.
|
6
4
|
def go_to_posts_page
|
7
5
|
find_posts_url
|
8
6
|
store_posts_url
|
9
7
|
@current_page = @posts_url.click
|
10
8
|
end
|
11
|
-
|
9
|
+
|
12
10
|
# Navigates to previous Mechanize page.
|
13
11
|
def previous_page
|
14
12
|
@current_page = @agent.get(@agent.back['href'])
|
@@ -21,6 +19,7 @@ module Navigation
|
|
21
19
|
else
|
22
20
|
store_indicator
|
23
21
|
@current_element = @indicator
|
22
|
+
@current_element
|
24
23
|
end
|
25
24
|
end
|
26
25
|
|
@@ -38,25 +37,11 @@ module Navigation
|
|
38
37
|
end
|
39
38
|
end
|
40
39
|
|
41
|
-
####
|
42
|
-
# FIXME: this selects the first child and can lead
|
43
|
-
# bot into rabit hole. Only use when at lower levels of html.
|
44
|
-
# Need to change this to an iteration in the future.
|
45
|
-
#
|
46
|
-
# Changes current Nokogiri element to first child.
|
47
|
-
def descend
|
48
|
-
if @current_element.children.empty? == true # no more room to descend
|
49
|
-
puts 'At lowest element. Nothing left to descend.'
|
50
|
-
else
|
51
|
-
@current_element = @current_element.child
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
40
|
def auto_ascend
|
56
41
|
ascend until see_multiple_links? == true
|
57
42
|
@current_element
|
58
43
|
end
|
59
|
-
|
44
|
+
|
60
45
|
def locate_popular_links
|
61
46
|
if possible_success? == false
|
62
47
|
ignorance_error
|
@@ -67,8 +52,8 @@ module Navigation
|
|
67
52
|
see_popular? == true ? crawl_popular : ignorance_error
|
68
53
|
end
|
69
54
|
end
|
70
|
-
|
71
|
-
# Examine popular element and climb DOM tree until multiple
|
55
|
+
|
56
|
+
# Examine popular element and climb DOM tree until multiple
|
72
57
|
# links are present.
|
73
58
|
def crawl_popular
|
74
59
|
go_to_popular
|
data/lib/blogbot/reflection.rb
CHANGED
data/test/blogbot.rb
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'mechanize'
|
4
|
+
require 'blogbot/extraction'
|
5
|
+
require 'blogbot/memorization'
|
6
|
+
require 'blogbot/navigation'
|
7
|
+
require 'blogbot/reflection'
|
8
|
+
|
9
|
+
# Make a new blogbot that can scan pages and extract the most popular links!
|
10
|
+
class Blogbot
|
11
|
+
include Extraction
|
12
|
+
include Memorization
|
13
|
+
include Navigation
|
14
|
+
include Reflection
|
15
|
+
|
16
|
+
attr_accessor(
|
17
|
+
:agent,
|
18
|
+
:article_link,
|
19
|
+
:blog_link,
|
20
|
+
:current_element,
|
21
|
+
:current_page,
|
22
|
+
:indicator,
|
23
|
+
:parent,
|
24
|
+
:posts_url,
|
25
|
+
:target_url
|
26
|
+
)
|
27
|
+
|
28
|
+
def set_agent
|
29
|
+
self.agent = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }
|
30
|
+
end
|
31
|
+
|
32
|
+
def initialize
|
33
|
+
puts 'Powering up the rubatron generators!!'
|
34
|
+
set_agent
|
35
|
+
@current_page = nil
|
36
|
+
@current_element = nil
|
37
|
+
@popular_links = []
|
38
|
+
@indicator = nil
|
39
|
+
end
|
40
|
+
|
41
|
+
# GET a page using Mechanize and set to current page.
|
42
|
+
def scan(url)
|
43
|
+
@target_url = url
|
44
|
+
@current_page = @agent.get(@target_url)
|
45
|
+
end
|
46
|
+
|
47
|
+
def reset
|
48
|
+
set_agent
|
49
|
+
@current_page = nil
|
50
|
+
@current_element = nil
|
51
|
+
@popular_links = []
|
52
|
+
@indicator = nil
|
53
|
+
end
|
54
|
+
|
55
|
+
def ignorance_error
|
56
|
+
fail "Sorry, either there are no popular links present
|
57
|
+
or this bot isn't smart enough to extract this site yet\n"
|
58
|
+
end
|
59
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# Adds capability to extract data in an organized format from webpage.
|
2
|
+
module Extraction
|
3
|
+
####
|
4
|
+
# Extracts titles and hyperlinks from element being examined.
|
5
|
+
# If the text is an empty '' it's an <img>.
|
6
|
+
# Images are typically duplicate links and ok to skip.
|
7
|
+
def extract_links
|
8
|
+
puts 'Not enough links to extract' if see_multiple_links? == false
|
9
|
+
|
10
|
+
@current_element.css('a').each do |a|
|
11
|
+
next if a.text == '' || a['href'] == '#'
|
12
|
+
title = a.text
|
13
|
+
link = a['href']
|
14
|
+
hash = {title: title, link: link}
|
15
|
+
@popular_links << hash
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def display_links
|
20
|
+
puts "-"*50
|
21
|
+
@popular_links.each do |hash|
|
22
|
+
hash.each do |k, v|
|
23
|
+
puts "#{k.upcase}: #{v}"
|
24
|
+
end
|
25
|
+
puts
|
26
|
+
end
|
27
|
+
puts "-"*50
|
28
|
+
@popular_links
|
29
|
+
end
|
30
|
+
|
31
|
+
def extract(url)
|
32
|
+
reset
|
33
|
+
puts "\nExtracting ...\n"
|
34
|
+
scan url
|
35
|
+
locate_popular_links
|
36
|
+
extract_links
|
37
|
+
@popular_links.nil? == true ? simple_error : display_links
|
38
|
+
end
|
39
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# Adds capability to memorize things such as URLs, etc.
|
2
|
+
# Most variables are stored here.
|
3
|
+
module Memorization
|
4
|
+
# Searches page for link that says Articles or Blog.
|
5
|
+
def find_posts_url
|
6
|
+
@article_link = @current_page.link_with(text: /Articles/)
|
7
|
+
@blog_link = @current_page.link_with(text: /Blog/)
|
8
|
+
end
|
9
|
+
|
10
|
+
# Memorize posts_url found by find_posts_url.
|
11
|
+
def store_posts_url
|
12
|
+
@posts_url =
|
13
|
+
case
|
14
|
+
when @article_link.nil? == false
|
15
|
+
@article_link
|
16
|
+
when @blog_link.nil? == false
|
17
|
+
@blog_link
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
# Sets search indicator to whatever had 'Popular' in its text.
|
22
|
+
def store_indicator
|
23
|
+
@indicator = @current_page.search("[text()*='Popular']").first
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
# Adds capability to navigate through posts pages.
|
2
|
+
module Navigation
|
3
|
+
# Navigates to posts page based off of store_posts_url.
|
4
|
+
def go_to_posts_page
|
5
|
+
find_posts_url
|
6
|
+
store_posts_url
|
7
|
+
@current_page = @posts_url.click
|
8
|
+
end
|
9
|
+
|
10
|
+
# Navigates to previous Mechanize page.
|
11
|
+
def previous_page
|
12
|
+
@current_page = @agent.get(@agent.back['href'])
|
13
|
+
end
|
14
|
+
|
15
|
+
# Sets current element to 'Popular' indicator.
|
16
|
+
def go_to_popular
|
17
|
+
if see_popular? == false
|
18
|
+
puts 'Nothing says "Popular" on this page'
|
19
|
+
else
|
20
|
+
store_indicator
|
21
|
+
@current_element = @indicator
|
22
|
+
@current_element
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
# Returns parent of current Nokogiri element
|
27
|
+
def find_parent
|
28
|
+
@parent = @current_element.parent # one element higher, the div container
|
29
|
+
end
|
30
|
+
|
31
|
+
# Changes current Nokogiri element to its parent.
|
32
|
+
def ascend
|
33
|
+
if @current_element.ancestors.empty? == true # no more room to ascend
|
34
|
+
puts 'At highest element. Nothing left to ascend.'
|
35
|
+
else
|
36
|
+
@current_element = @current_element.parent
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
def auto_ascend
|
41
|
+
ascend until see_multiple_links? == true
|
42
|
+
@current_element
|
43
|
+
end
|
44
|
+
|
45
|
+
def locate_popular_links
|
46
|
+
if possible_success? == false
|
47
|
+
ignorance_error
|
48
|
+
elsif see_popular? == true
|
49
|
+
crawl_popular
|
50
|
+
elsif see_posts? == true
|
51
|
+
go_to_posts_page
|
52
|
+
see_popular? == true ? crawl_popular : ignorance_error
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
# Examine popular element and climb DOM tree until multiple
|
57
|
+
# links are present.
|
58
|
+
def crawl_popular
|
59
|
+
go_to_popular
|
60
|
+
auto_ascend
|
61
|
+
end
|
62
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# Adds capability to examine for posts, articles, and links.
|
2
|
+
module Reflection
|
3
|
+
# Searches for link that says articles or blog.
|
4
|
+
def see_posts?
|
5
|
+
find_posts_url
|
6
|
+
if @article_link.nil? == true && @blog_link.nil? == true
|
7
|
+
false # => no article links found
|
8
|
+
else
|
9
|
+
true
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
# Searches for keyword 'Popular' on current page.
|
14
|
+
def see_popular?
|
15
|
+
search = @current_page.search "[text()*='Popular']"
|
16
|
+
search.empty? ? false : true
|
17
|
+
end
|
18
|
+
|
19
|
+
# Searches for presence of <a> tags in current element.
|
20
|
+
def see_links?
|
21
|
+
@current_element.css('a').empty? ? false : true
|
22
|
+
end
|
23
|
+
|
24
|
+
# Searches for the presence of more than two <a> tags in current element.
|
25
|
+
def see_multiple_links?
|
26
|
+
@current_element.css('a').length < 3 ? false : true
|
27
|
+
end
|
28
|
+
|
29
|
+
####
|
30
|
+
# Determines if bot can crawl a Popular section.
|
31
|
+
# If it doesn't see popular at first glance, it searches and navigates
|
32
|
+
# to Posts page (Articles or Blog).
|
33
|
+
#
|
34
|
+
# If Popular section still doesn't exist, it's a no-go.
|
35
|
+
def possible_success?
|
36
|
+
if see_popular? == true
|
37
|
+
true
|
38
|
+
elsif see_posts? == true
|
39
|
+
go_to_posts_page # Changes page to look for popular.
|
40
|
+
answer = see_popular? == true ? true : false # true if popular is present
|
41
|
+
previous_page # Check is complete. Return to original page.
|
42
|
+
answer
|
43
|
+
else
|
44
|
+
false
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: blogbot
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Mason
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-09-
|
11
|
+
date: 2015-09-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -37,6 +37,11 @@ files:
|
|
37
37
|
- lib/blogbot/memorization.rb
|
38
38
|
- lib/blogbot/navigation.rb
|
39
39
|
- lib/blogbot/reflection.rb
|
40
|
+
- test/blogbot.rb
|
41
|
+
- test/blogbot/extraction.rb
|
42
|
+
- test/blogbot/memorization.rb
|
43
|
+
- test/blogbot/navigation.rb
|
44
|
+
- test/blogbot/reflection.rb
|
40
45
|
homepage: https://github.com/m8ss/blogbot
|
41
46
|
licenses:
|
42
47
|
- MIT
|
@@ -61,4 +66,9 @@ rubygems_version: 2.4.6
|
|
61
66
|
signing_key:
|
62
67
|
specification_version: 4
|
63
68
|
summary: The internet is full of noise. Only read the best.
|
64
|
-
test_files:
|
69
|
+
test_files:
|
70
|
+
- test/blogbot.rb
|
71
|
+
- test/blogbot/extraction.rb
|
72
|
+
- test/blogbot/memorization.rb
|
73
|
+
- test/blogbot/navigation.rb
|
74
|
+
- test/blogbot/reflection.rb
|