blogbot 0.0.1 → 0.0.3.beta
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/blogbot.gemspec +2 -2
- data/lib/blogbot.rb +65 -2
- data/lib/blogbot/extraction.rb +27 -0
- data/lib/blogbot/memorization.rb +25 -0
- data/lib/blogbot/navigation.rb +77 -0
- data/lib/blogbot/reflection.rb +47 -0
- metadata +8 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6c5e8d092c22a8a7b833f4ff9ca2316233f83052
|
4
|
+
data.tar.gz: b40ef4f4c864f045738d6ec5befcc57a2bf0cd26
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2ac8cc16f576800bcd01160da49ef160c6905ac3a0a1e109c4b3e619b08878149ef3411eaec67c48f9d96e80716cfc9ad610e1a84935261d856f7d361ebbe1ed
|
7
|
+
data.tar.gz: 6db095cf79251f28abc45b99194b6a80afec1dd921d029459e2bd9d23c8b643c8e75737559d77f49f5e075f33a3b9ccff5a3a3bb341bc8c32d65dfba20f5e10b
|
data/README.md
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
# blogbot
|
2
2
|
|
3
|
-
Bot that
|
3
|
+
Bot that extracts the most popular articles from websites.
|
4
4
|
|
5
5
|
## DESCRIPTION
|
6
6
|
|
7
|
-
The internet is full of noise. Only read the best
|
7
|
+
The internet is full of noise. Only read the best.
|
8
8
|
|
9
9
|
## DISCLAIMER
|
10
10
|
|
data/blogbot.gemspec
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |gem|
|
4
4
|
gem.name = 'blogbot'
|
5
|
-
gem.version = '0.0.
|
5
|
+
gem.version = '0.0.3.beta'
|
6
6
|
gem.date = '2015-09-22'
|
7
7
|
gem.platform = Gem::Platform::RUBY
|
8
8
|
gem.required_ruby_version = '>= 1.8'
|
@@ -14,7 +14,7 @@ Gem::Specification.new do |gem|
|
|
14
14
|
gem.description = 'Bot that crawls the most popular articles from websites. '
|
15
15
|
gem.authors = ['John Mason']
|
16
16
|
gem.email = 'mace2345@gmail.com'
|
17
|
-
gem.homepage = 'https://github.com/m8ss/
|
17
|
+
gem.homepage = 'https://github.com/m8ss/blogbot'
|
18
18
|
gem.license = 'MIT'
|
19
19
|
|
20
20
|
gem.add_runtime_dependency('mechanize', '~> 2.7')
|
data/lib/blogbot.rb
CHANGED
@@ -1,7 +1,70 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
require 'mechanize'
|
4
|
+
require './blogbot/extraction'
|
5
|
+
require './blogbot/memorization'
|
6
|
+
require './blogbot/navigation'
|
7
|
+
require './blogbot/reflection'
|
4
8
|
|
5
|
-
#
|
6
|
-
|
9
|
+
# MASTER GAME PLAN:
|
10
|
+
#
|
11
|
+
# 1) accept target site
|
12
|
+
# 2) scan for keyword popular
|
13
|
+
# a- if on homepage and nothing comes up, find blogpage then scan again
|
14
|
+
# b- if nothing is coming up, scan sitemap.xml, etc for keyword popular
|
15
|
+
# 3) determine div id/class of where keyword popular is located
|
16
|
+
# 4) extract all hyperlinks and store in hash
|
17
|
+
# 5) index??
|
18
|
+
#
|
19
|
+
|
20
|
+
# Bot that scans a web page and extracts the links of its most popular
# articles.
#
# Behaviour is split across four mixins that all operate on the instance
# variables initialised by #reset:
#   Extraction   - builds the hash of popular links
#   Memorization - stores links/elements found on the current page
#   Navigation   - moves between pages and between DOM elements
#   Reflection   - predicates describing the current page/element
class Blogbot
  include Extraction
  include Memorization
  include Navigation
  include Reflection

  attr_accessor(
    :agent,
    :article_link,
    :blog_link,
    :current_element,
    :current_page,
    :indicator,
    :parent,
    :posts_url,
    :target_url
  )

  # Prints a start-up banner and puts the bot into its pristine state.
  def initialize
    puts 'Powering up the rubatron generators!!'
    reset # same agent/state set-up as a manual reset; keeps the two in sync
  end

  # Replaces the agent with a fresh Mechanize instance that uses the
  # 'Mac Safari' user-agent alias.
  def set_agent
    self.agent = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }
  end

  # GETs +url+ with the agent, remembers it as the target URL and makes the
  # response the current page.
  #
  # Returns whatever the agent's #get returns.
  def scan(url)
    @target_url = url
    @current_page = @agent.get(@target_url)
  end

  # Creates a new agent and clears the current page, current element,
  # collected links and 'Popular' indicator.
  def reset
    set_agent
    @current_page = nil
    @current_element = nil
    @popular_links = {}
    @indicator = nil
  end

  # Raises a RuntimeError explaining that no popular links could be found.
  #
  # The message is a single line: the original text contained a raw line
  # break and ended in a literal "/n" (a mistyped newline escape).
  def ignorance_error
    raise 'Sorry, either there are no popular links present ' \
          "or this bot isn't smart enough to extract this site yet"
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# Adds capability to extract data in an organized format from webpage.
#
# Expects the including object to provide @current_element, @popular_links
# and the methods reset, scan, locate_popular_links, see_multiple_links?
# and ignorance_error.
module Extraction
  # Collects the hyperlinks found beneath @current_element into
  # @popular_links, keyed by href with the link text as value.
  #
  # Skipped anchors:
  # * blank text - typically an <img> link duplicating a text link
  # * no href attribute - would otherwise be stored under a nil key
  # * href of '#' - a placeholder, not a real destination
  #
  # Prints a warning (but still proceeds) when fewer than three links are
  # visible. Returns the entire @popular_links hash.
  def extract_links
    puts 'Not enough links to extract' unless see_multiple_links?

    @current_element.css('a').each do |anchor|
      title = anchor.text
      link = anchor['href']
      next if title.strip.empty? || link.nil? || link == '#'

      @popular_links[link] = title
    end
    @popular_links
  end

  # Runs a complete extraction against +url+: resets the bot, loads the
  # page, locates the popular section and collects its links.
  #
  # Returns the hash of href => title. Calls ignorance_error when nothing
  # was collected. (The original tested for nil, which never happens after
  # reset, and called the undefined method simple_error.)
  def extract(url)
    reset
    puts "\nExtracting ...\n"
    scan url
    locate_popular_links
    extract_links
    @popular_links.empty? ? ignorance_error : @popular_links
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# Adds capability to memorize things such as URLs, etc.
# Most variables are stored here.
#
# Every method reads @current_page, so a page must have been loaded first.
module Memorization
  # Searches the current page for a link whose text matches /Articles/ and
  # one whose text matches /Blog/ (both case-sensitive) and stores them in
  # @article_link and @blog_link. Either is nil when no such link exists.
  def find_posts_url
    @article_link = @current_page.link_with(text: /Articles/)
    @blog_link = @current_page.link_with(text: /Blog/)
  end

  # Memorizes the posts link found by find_posts_url in @posts_url.
  # The Articles link wins over the Blog link; nil when neither was found.
  def store_posts_url
    @posts_url = @article_link || @blog_link
  end

  # Sets the search indicator to the first node on the current page whose
  # text contains 'Popular' (nil when there is none).
  def store_indicator
    @indicator = @current_page.search("[text()*='Popular']").first
  end
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# Adds capability to navigate through posts pages.
#
# Works on @agent (a Mechanize agent), @current_page and @current_element
# (a Nokogiri node) and relies on the predicates and storage helpers mixed
# into the same object (see_popular?, see_posts?, see_multiple_links?,
# find_posts_url, store_posts_url, store_indicator, ignorance_error).
module Navigation
  # TODO: add method to auto navigate parents until proper links are present

  # Navigates to posts page based off of store_posts_url.
  # Callers are expected to check see_posts? first; @posts_url is nil when
  # the page has no Articles/Blog link.
  def go_to_posts_page
    find_posts_url
    store_posts_url
    @current_page = @posts_url.click
  end

  # Navigates to previous Mechanize page.
  #
  # Steps the agent's history back and adopts the page that is now current.
  # (The original indexed the page returned by #back with ['href'] and
  # fetched the result again; a page has no such entry.)
  def previous_page
    @agent.back
    @current_page = @agent.current_page
  end

  # Sets current element to 'Popular' indicator.
  # Prints a notice and leaves the current element alone when the page
  # does not mention 'Popular'.
  def go_to_popular
    if see_popular?
      store_indicator
      @current_element = @indicator
    else
      puts 'Nothing says "Popular" on this page'
    end
  end

  # Returns parent of current Nokogiri element
  def find_parent
    @parent = @current_element.parent # one element higher, the div container
  end

  # Changes current Nokogiri element to its parent.
  def ascend
    if @current_element.ancestors.empty? # no more room to ascend
      puts 'At highest element. Nothing left to ascend.'
    else
      @current_element = @current_element.parent
    end
  end

  ####
  # FIXME: this selects the first child and can lead
  # bot into rabbit hole. Only use when at lower levels of html.
  # Need to change this to an iteration in the future.
  #
  # Changes current Nokogiri element to first child.
  def descend
    if @current_element.children.empty? # no more room to descend
      puts 'At lowest element. Nothing left to descend.'
    else
      @current_element = @current_element.child
    end
  end

  # Climbs from the current element until multiple links are visible.
  #
  # Stops at the top of the tree even if it still lacks enough links;
  # without that check the loop would never end, because ascend does
  # nothing once there are no ancestors left.
  #
  # Returns the element the bot ended up on.
  def auto_ascend
    until see_multiple_links?
      break if @current_element.ancestors.empty?

      ascend
    end
    @current_element
  end

  # Positions the bot on the element containing the popular links.
  #
  # Crawls the current page when it mentions 'Popular'; otherwise follows
  # the posts link and crawls there. Calls ignorance_error when neither
  # page has a popular section.
  def locate_popular_links
    if !possible_success?
      ignorance_error
    elsif see_popular?
      crawl_popular
    elsif see_posts?
      go_to_posts_page
      see_popular? ? crawl_popular : ignorance_error
    end
  end

  # Examine popular element and climb DOM tree until multiple
  # links are present.
  def crawl_popular
    go_to_popular
    auto_ascend
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# Adds capability to examine for posts, articles, and links.
#
# All predicates return true or false. They read @current_page and
# @current_element and treat an unset (nil) page or element as
# "nothing to see" instead of raising.
module Reflection
  # Searches for link that says articles or blog.
  # Refreshes @article_link/@blog_link via find_posts_url first.
  def see_posts?
    find_posts_url
    !(@article_link.nil? && @blog_link.nil?)
  end

  # Searches for keyword 'Popular' on current page.
  def see_popular?
    return false if @current_page.nil?

    !@current_page.search("[text()*='Popular']").empty?
  end

  # Searches for presence of <a> tags in current element.
  def see_links?
    anchor_count > 0
  end

  # Searches for presence of more than two <a> tags in current element.
  def see_multiple_links?
    anchor_count > 2
  end

  ####
  # Determines if bot can crawl a Popular section.
  # If it doesn't see popular at first glance, it searches and navigates
  # to Posts page (Articles or Blog).
  #
  # If Popular section still doesn't exist, it's a no-go.
  def possible_success?
    return true if see_popular?
    return false unless see_posts?

    go_to_posts_page        # Changes page to look for popular.
    answer = see_popular?   # true if popular is present
    previous_page           # Check is complete. Return to original page.
    answer
  end

  private

  # Number of <a> tags beneath the current element; 0 when no element has
  # been selected yet.
  def anchor_count
    @current_element.nil? ? 0 : @current_element.css('a').length
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: blogbot
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3.beta
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Mason
|
@@ -33,7 +33,11 @@ files:
|
|
33
33
|
- README.md
|
34
34
|
- blogbot.gemspec
|
35
35
|
- lib/blogbot.rb
|
36
|
-
|
36
|
+
- lib/blogbot/extraction.rb
|
37
|
+
- lib/blogbot/memorization.rb
|
38
|
+
- lib/blogbot/navigation.rb
|
39
|
+
- lib/blogbot/reflection.rb
|
40
|
+
homepage: https://github.com/m8ss/blogbot
|
37
41
|
licenses:
|
38
42
|
- MIT
|
39
43
|
metadata: {}
|
@@ -48,9 +52,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
48
52
|
version: '1.8'
|
49
53
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
50
54
|
requirements:
|
51
|
-
- - "
|
55
|
+
- - ">"
|
52
56
|
- !ruby/object:Gem::Version
|
53
|
-
version:
|
57
|
+
version: 1.3.1
|
54
58
|
requirements: []
|
55
59
|
rubyforge_project:
|
56
60
|
rubygems_version: 2.4.6
|