blogbot 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 89e4fdb5e3c981f8fc1eba19c63b2643870ff903
4
- data.tar.gz: 637a092abf302128fc5bdd3bfddb04f599b44978
3
+ metadata.gz: 5ba38caa6667e36e2396162590ecf399cc604a3a
4
+ data.tar.gz: acd1158109ad0e5c749e2ba8083a3a28f51ba6af
5
5
  SHA512:
6
- metadata.gz: cfcfc8c1207fc3946302f67d3445668f2ef418dc8c3619a63cd1ae70fcec96c836cab2123d17949bc28c60f1ee2240f2a1f1a269c3ea9db85c0bba60423fa535
7
- data.tar.gz: b4b6f70582599014fa00cad198e15493573350d3db99d3842e657898ac72e19df54e30b7e36ee465230ad93a3632578337ffc36e304b5acbbd6091ee10124ade
6
+ metadata.gz: 41f3cd5d74c3fa5ab90d3bbc4453add4c59216d49985b248701303c123c71b9c12f3e95db9ec8ce18fc06bd2fe3a1dbbd6fea458de8fa46b1024297fd3c6569c
7
+ data.tar.gz: 1b69e32e940fa81e48f5e13b6339859f1dd0d32f317e5f42f71b94a3465899918ef66bed864bbf84751a06b90806819aa1064eabf885caaa8aa47344a90f05c8
@@ -2,8 +2,8 @@
2
2
 
3
3
  Gem::Specification.new do |gem|
4
4
  gem.name = 'blogbot'
5
- gem.version = '0.0.4'
6
- gem.date = '2015-09-22'
5
+ gem.version = '0.0.5'
6
+ gem.date = '2015-09-25'
7
7
  gem.platform = Gem::Platform::RUBY
8
8
  gem.required_ruby_version = '>= 1.8'
9
9
 
@@ -6,17 +6,6 @@ require 'blogbot/memorization'
6
6
  require 'blogbot/navigation'
7
7
  require 'blogbot/reflection'
8
8
 
9
- # MASTER GAME PLAN:
10
- #
11
- # 1) accept target site
12
- # 2) scan for keyword popular
13
- # a- if on homepage and nothing comes up, find blogpage then scan again
14
- # b- if nothing is coming up, scan sitemap.xml, etc for keyword popular
15
- # 3) determine div id/class of where keyword popular is located
16
- # 4) extract all hyperlinks and store in hash
17
- # 5) index??
18
- #
19
-
20
9
  # Make a new blogbot that can scan pages and extract the most popular links!
21
10
  class Blogbot
22
11
  include Extraction
@@ -45,7 +34,7 @@ class Blogbot
45
34
  set_agent
46
35
  @current_page = nil
47
36
  @current_element = nil
48
- @popular_links = {}
37
+ @popular_links = []
49
38
  @indicator = nil
50
39
  end
51
40
 
@@ -59,12 +48,12 @@ class Blogbot
59
48
  set_agent
60
49
  @current_page = nil
61
50
  @current_element = nil
62
- @popular_links = {}
51
+ @popular_links = []
63
52
  @indicator = nil
64
53
  end
65
54
 
66
55
  def ignorance_error
67
- raise "Sorry, either there are no popular links present
68
- or this bot isn't smart enough to extract this site yet/n"
56
+ fail "Sorry, either there are no popular links present
57
+ or this bot isn't smart enough to extract this site yet\n"
69
58
  end
70
59
  end
@@ -8,12 +8,24 @@ module Extraction
8
8
  puts 'Not enough links to extract' if see_multiple_links? == false
9
9
 
10
10
  @current_element.css('a').each do |a|
11
- next if a.text == '' || a['href'] == '#'
11
+ next if a.text == '' || a['href'] == '#'
12
12
  title = a.text
13
13
  link = a['href']
14
- @popular_links[link] = title
14
+ hash = {title: title, link: link}
15
+ @popular_links << hash
15
16
  end
16
- @popular_links # Returns entire hash.
17
+ end
18
+
19
+ def display_links
20
+ puts "-"*50
21
+ @popular_links.each do |hash|
22
+ hash.each do |k, v|
23
+ puts "#{k.upcase}: #{v}"
24
+ end
25
+ puts
26
+ end
27
+ puts "-"*50
28
+ @popular_links
17
29
  end
18
30
 
19
31
  def extract(url)
@@ -21,7 +33,7 @@ module Extraction
21
33
  puts "\nExtracting ...\n"
22
34
  scan url
23
35
  locate_popular_links
24
- extract_links
25
- @popular_links.nil? == true ? simple_error : @popular_links
36
+ extract_links
37
+ @popular_links.nil? == true ? simple_error : display_links
26
38
  end
27
39
  end
@@ -1,14 +1,12 @@
1
1
  # Adds capability to navigate through posts pages.
2
2
  module Navigation
3
- # TODO: add method to auto navigate parents until proper links are present
4
-
5
3
  # Navigates to posts page based off of store_posts_url.
6
4
  def go_to_posts_page
7
5
  find_posts_url
8
6
  store_posts_url
9
7
  @current_page = @posts_url.click
10
8
  end
11
-
9
+
12
10
  # Navigates to previous Mechanize page.
13
11
  def previous_page
14
12
  @current_page = @agent.get(@agent.back['href'])
@@ -21,6 +19,7 @@ module Navigation
21
19
  else
22
20
  store_indicator
23
21
  @current_element = @indicator
22
+ @current_element
24
23
  end
25
24
  end
26
25
 
@@ -38,25 +37,11 @@ module Navigation
38
37
  end
39
38
  end
40
39
 
41
- ####
42
- # FIXME: this selects the first child and can lead
43
- # bot into rabit hole. Only use when at lower levels of html.
44
- # Need to change this to an iteration in the future.
45
- #
46
- # Changes current Nokogiri element to first child.
47
- def descend
48
- if @current_element.children.empty? == true # no more room to descend
49
- puts 'At lowest element. Nothing left to descend.'
50
- else
51
- @current_element = @current_element.child
52
- end
53
- end
54
-
55
40
  def auto_ascend
56
41
  ascend until see_multiple_links? == true
57
42
  @current_element
58
43
  end
59
-
44
+
60
45
  def locate_popular_links
61
46
  if possible_success? == false
62
47
  ignorance_error
@@ -67,8 +52,8 @@ module Navigation
67
52
  see_popular? == true ? crawl_popular : ignorance_error
68
53
  end
69
54
  end
70
-
71
- # Examine popular element and climb DOM tree until multiple
55
+
56
+ # Examine popular element and climb DOM tree until multiple
72
57
  # links are present.
73
58
  def crawl_popular
74
59
  go_to_popular
@@ -25,7 +25,7 @@ module Reflection
25
25
  def see_multiple_links?
26
26
  @current_element.css('a').length < 3 ? false : true
27
27
  end
28
-
28
+
29
29
  ####
30
30
  # Determines if bot can crawl a Popular section.
31
31
  # If it doesn't see popular at first glance, it searches and navigates
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'mechanize'
4
+ require 'blogbot/extraction'
5
+ require 'blogbot/memorization'
6
+ require 'blogbot/navigation'
7
+ require 'blogbot/reflection'
8
+
9
+ # Make a new blogbot that can scan pages and extract the most popular links!
10
+ class Blogbot
11
+ include Extraction
12
+ include Memorization
13
+ include Navigation
14
+ include Reflection
15
+
16
+ attr_accessor(
17
+ :agent,
18
+ :article_link,
19
+ :blog_link,
20
+ :current_element,
21
+ :current_page,
22
+ :indicator,
23
+ :parent,
24
+ :posts_url,
25
+ :target_url
26
+ )
27
+
28
+ def set_agent
29
+ self.agent = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }
30
+ end
31
+
32
+ def initialize
33
+ puts 'Powering up the rubatron generators!!'
34
+ set_agent
35
+ @current_page = nil
36
+ @current_element = nil
37
+ @popular_links = []
38
+ @indicator = nil
39
+ end
40
+
41
+ # GET a page using Mechanize and set to current page.
42
+ def scan(url)
43
+ @target_url = url
44
+ @current_page = @agent.get(@target_url)
45
+ end
46
+
47
+ def reset
48
+ set_agent
49
+ @current_page = nil
50
+ @current_element = nil
51
+ @popular_links = []
52
+ @indicator = nil
53
+ end
54
+
55
+ def ignorance_error
56
+ fail "Sorry, either there are no popular links present
57
+ or this bot isn't smart enough to extract this site yet\n"
58
+ end
59
+ end
@@ -0,0 +1,39 @@
1
+ # Adds capability to extract data in an organized format from webpage.
2
+ module Extraction
3
+ ####
4
+ # Extracts titles and hyperlinks from element being examined.
5
+ # If the text is empty (''), the link is an <img>.
6
+ # Images are typically duplicate links and ok to skip.
7
+ def extract_links
8
+ puts 'Not enough links to extract' if see_multiple_links? == false
9
+
10
+ @current_element.css('a').each do |a|
11
+ next if a.text == '' || a['href'] == '#'
12
+ title = a.text
13
+ link = a['href']
14
+ hash = {title: title, link: link}
15
+ @popular_links << hash
16
+ end
17
+ end
18
+
19
+ def display_links
20
+ puts "-"*50
21
+ @popular_links.each do |hash|
22
+ hash.each do |k, v|
23
+ puts "#{k.upcase}: #{v}"
24
+ end
25
+ puts
26
+ end
27
+ puts "-"*50
28
+ @popular_links
29
+ end
30
+
31
+ def extract(url)
32
+ reset
33
+ puts "\nExtracting ...\n"
34
+ scan url
35
+ locate_popular_links
36
+ extract_links
37
+ @popular_links.nil? == true ? simple_error : display_links
38
+ end
39
+ end
@@ -0,0 +1,25 @@
1
+ # Adds capability to memorize things such as URLs, etc.
2
+ # Most variables are stored here.
3
+ module Memorization
4
+ # Searches page for link that says Articles or Blog.
5
+ def find_posts_url
6
+ @article_link = @current_page.link_with(text: /Articles/)
7
+ @blog_link = @current_page.link_with(text: /Blog/)
8
+ end
9
+
10
+ # Memorize posts_url found by find_posts_url.
11
+ def store_posts_url
12
+ @posts_url =
13
+ case
14
+ when @article_link.nil? == false
15
+ @article_link
16
+ when @blog_link.nil? == false
17
+ @blog_link
18
+ end
19
+ end
20
+
21
+ # Sets search indicator to whatever had 'Popular' in its text.
22
+ def store_indicator
23
+ @indicator = @current_page.search("[text()*='Popular']").first
24
+ end
25
+ end
@@ -0,0 +1,62 @@
1
+ # Adds capability to navigate through posts pages.
2
+ module Navigation
3
+ # Navigates to posts page based off of store_posts_url.
4
+ def go_to_posts_page
5
+ find_posts_url
6
+ store_posts_url
7
+ @current_page = @posts_url.click
8
+ end
9
+
10
+ # Navigates to previous Mechanize page.
11
+ def previous_page
12
+ @current_page = @agent.get(@agent.back['href'])
13
+ end
14
+
15
+ # Sets current element to 'Popular' indicator.
16
+ def go_to_popular
17
+ if see_popular? == false
18
+ puts 'Nothing says "Popular" on this page'
19
+ else
20
+ store_indicator
21
+ @current_element = @indicator
22
+ @current_element
23
+ end
24
+ end
25
+
26
+ # Returns parent of current Nokogiri element
27
+ def find_parent
28
+ @parent = @current_element.parent # one element higher, the div container
29
+ end
30
+
31
+ # Changes current Nokogiri element to its parent.
32
+ def ascend
33
+ if @current_element.ancestors.empty? == true # no more room to ascend
34
+ puts 'At highest element. Nothing left to ascend.'
35
+ else
36
+ @current_element = @current_element.parent
37
+ end
38
+ end
39
+
40
+ def auto_ascend
41
+ ascend until see_multiple_links? == true
42
+ @current_element
43
+ end
44
+
45
+ def locate_popular_links
46
+ if possible_success? == false
47
+ ignorance_error
48
+ elsif see_popular? == true
49
+ crawl_popular
50
+ elsif see_posts? == true
51
+ go_to_posts_page
52
+ see_popular? == true ? crawl_popular : ignorance_error
53
+ end
54
+ end
55
+
56
+ # Examine popular element and climb DOM tree until multiple
57
+ # links are present.
58
+ def crawl_popular
59
+ go_to_popular
60
+ auto_ascend
61
+ end
62
+ end
@@ -0,0 +1,47 @@
1
+ # Adds capability to examine for posts, articles, and links.
2
+ module Reflection
3
+ # Searches for link that says articles or blog.
4
+ def see_posts?
5
+ find_posts_url
6
+ if @article_link.nil? == true && @blog_link.nil? == true
7
+ false # => no article links found
8
+ else
9
+ true
10
+ end
11
+ end
12
+
13
+ # Searches for keyword 'Popular' on current page.
14
+ def see_popular?
15
+ search = @current_page.search "[text()*='Popular']"
16
+ search.empty? ? false : true
17
+ end
18
+
19
+ # Searches for presence of <a> tags in current element.
20
+ def see_links?
21
+ @current_element.css('a').empty? ? false : true
22
+ end
23
+
24
+ # Searches for presence of more than two <a> tags in current element.
25
+ def see_multiple_links?
26
+ @current_element.css('a').length < 3 ? false : true
27
+ end
28
+
29
+ ####
30
+ # Determines if bot can crawl a Popular section.
31
+ # If it doesn't see popular at first glance, it searches and navigates
32
+ # to Posts page (Articles or Blog).
33
+ #
34
+ # If Popular section still doesn't exist, it's a no-go.
35
+ def possible_success?
36
+ if see_popular? == true
37
+ true
38
+ elsif see_posts? == true
39
+ go_to_posts_page # Changes page to look for popular.
40
+ answer = see_popular? == true ? true : false # true if popular is present
41
+ previous_page # Check is complete. Return to original page.
42
+ answer
43
+ else
44
+ false
45
+ end
46
+ end
47
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: blogbot
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Mason
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-09-22 00:00:00.000000000 Z
11
+ date: 2015-09-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -37,6 +37,11 @@ files:
37
37
  - lib/blogbot/memorization.rb
38
38
  - lib/blogbot/navigation.rb
39
39
  - lib/blogbot/reflection.rb
40
+ - test/blogbot.rb
41
+ - test/blogbot/extraction.rb
42
+ - test/blogbot/memorization.rb
43
+ - test/blogbot/navigation.rb
44
+ - test/blogbot/reflection.rb
40
45
  homepage: https://github.com/m8ss/blogbot
41
46
  licenses:
42
47
  - MIT
@@ -61,4 +66,9 @@ rubygems_version: 2.4.6
61
66
  signing_key:
62
67
  specification_version: 4
63
68
  summary: The internet is full of noise. Only read the best.
64
- test_files: []
69
+ test_files:
70
+ - test/blogbot.rb
71
+ - test/blogbot/extraction.rb
72
+ - test/blogbot/memorization.rb
73
+ - test/blogbot/navigation.rb
74
+ - test/blogbot/reflection.rb