blogbot 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 89e4fdb5e3c981f8fc1eba19c63b2643870ff903
4
- data.tar.gz: 637a092abf302128fc5bdd3bfddb04f599b44978
3
+ metadata.gz: 5ba38caa6667e36e2396162590ecf399cc604a3a
4
+ data.tar.gz: acd1158109ad0e5c749e2ba8083a3a28f51ba6af
5
5
  SHA512:
6
- metadata.gz: cfcfc8c1207fc3946302f67d3445668f2ef418dc8c3619a63cd1ae70fcec96c836cab2123d17949bc28c60f1ee2240f2a1f1a269c3ea9db85c0bba60423fa535
7
- data.tar.gz: b4b6f70582599014fa00cad198e15493573350d3db99d3842e657898ac72e19df54e30b7e36ee465230ad93a3632578337ffc36e304b5acbbd6091ee10124ade
6
+ metadata.gz: 41f3cd5d74c3fa5ab90d3bbc4453add4c59216d49985b248701303c123c71b9c12f3e95db9ec8ce18fc06bd2fe3a1dbbd6fea458de8fa46b1024297fd3c6569c
7
+ data.tar.gz: 1b69e32e940fa81e48f5e13b6339859f1dd0d32f317e5f42f71b94a3465899918ef66bed864bbf84751a06b90806819aa1064eabf885caaa8aa47344a90f05c8
@@ -2,8 +2,8 @@
2
2
 
3
3
  Gem::Specification.new do |gem|
4
4
  gem.name = 'blogbot'
5
- gem.version = '0.0.4'
6
- gem.date = '2015-09-22'
5
+ gem.version = '0.0.5'
6
+ gem.date = '2015-09-25'
7
7
  gem.platform = Gem::Platform::RUBY
8
8
  gem.required_ruby_version = '>= 1.8'
9
9
 
@@ -6,17 +6,6 @@ require 'blogbot/memorization'
6
6
  require 'blogbot/navigation'
7
7
  require 'blogbot/reflection'
8
8
 
9
- # MASTER GAME PLAN:
10
- #
11
- # 1) accept target site
12
- # 2) scan for keyword popular
13
- # a- if on homepage and nothing comes up, find blogpage then scan again
14
- # b- if nothing is coming up, scan sitemap.xml, etc for keyword popular
15
- # 3) determine div id/class of where keyword popular is located
16
- # 4) extract all hyperlinks and store in hash
17
- # 5) index??
18
- #
19
-
20
9
  # Make a new blogbot that can scan pages and extract the most popular links!
21
10
  class Blogbot
22
11
  include Extraction
@@ -45,7 +34,7 @@ class Blogbot
45
34
  set_agent
46
35
  @current_page = nil
47
36
  @current_element = nil
48
- @popular_links = {}
37
+ @popular_links = []
49
38
  @indicator = nil
50
39
  end
51
40
 
@@ -59,12 +48,12 @@ class Blogbot
59
48
  set_agent
60
49
  @current_page = nil
61
50
  @current_element = nil
62
- @popular_links = {}
51
+ @popular_links = []
63
52
  @indicator = nil
64
53
  end
65
54
 
66
55
  def ignorance_error
67
- raise "Sorry, either there are no popular links present
68
- or this bot isn't smart enough to extract this site yet/n"
56
+ fail "Sorry, either there are no popular links present
57
+ or this bot isn't smart enough to extract this site yet\n"
69
58
  end
70
59
  end
@@ -8,12 +8,24 @@ module Extraction
8
8
  puts 'Not enough links to extract' if see_multiple_links? == false
9
9
 
10
10
  @current_element.css('a').each do |a|
11
- next if a.text == '' || a['href'] == '#'
11
+ next if a.text == '' || a['href'] == '#'
12
12
  title = a.text
13
13
  link = a['href']
14
- @popular_links[link] = title
14
+ hash = {title: title, link: link}
15
+ @popular_links << hash
15
16
  end
16
- @popular_links # Returns entire hash.
17
+ end
18
+
19
+ def display_links
20
+ puts "-"*50
21
+ @popular_links.each do |hash|
22
+ hash.each do |k, v|
23
+ puts "#{k.upcase}: #{v}"
24
+ end
25
+ puts
26
+ end
27
+ puts "-"*50
28
+ @popular_links
17
29
  end
18
30
 
19
31
  def extract(url)
@@ -21,7 +33,7 @@ module Extraction
21
33
  puts "\nExtracting ...\n"
22
34
  scan url
23
35
  locate_popular_links
24
- extract_links
25
- @popular_links.nil? == true ? simple_error : @popular_links
36
+ extract_links
37
+ @popular_links.nil? == true ? simple_error : display_links
26
38
  end
27
39
  end
@@ -1,14 +1,12 @@
1
1
  # Adds capability to navigate through posts pages.
2
2
  module Navigation
3
- # TODO: add method to auto navigate parents until proper links are present
4
-
5
3
  # Navigates to posts page based off of store_posts_url.
6
4
  def go_to_posts_page
7
5
  find_posts_url
8
6
  store_posts_url
9
7
  @current_page = @posts_url.click
10
8
  end
11
-
9
+
12
10
  # Navigates to previous Mechanize page.
13
11
  def previous_page
14
12
  @current_page = @agent.get(@agent.back['href'])
@@ -21,6 +19,7 @@ module Navigation
21
19
  else
22
20
  store_indicator
23
21
  @current_element = @indicator
22
+ @current_element
24
23
  end
25
24
  end
26
25
 
@@ -38,25 +37,11 @@ module Navigation
38
37
  end
39
38
  end
40
39
 
41
- ####
42
- # FIXME: this selects the first child and can lead
43
- # bot into rabit hole. Only use when at lower levels of html.
44
- # Need to change this to an iteration in the future.
45
- #
46
- # Changes current Nokogiri element to first child.
47
- def descend
48
- if @current_element.children.empty? == true # no more room to descend
49
- puts 'At lowest element. Nothing left to descend.'
50
- else
51
- @current_element = @current_element.child
52
- end
53
- end
54
-
55
40
  def auto_ascend
56
41
  ascend until see_multiple_links? == true
57
42
  @current_element
58
43
  end
59
-
44
+
60
45
  def locate_popular_links
61
46
  if possible_success? == false
62
47
  ignorance_error
@@ -67,8 +52,8 @@ module Navigation
67
52
  see_popular? == true ? crawl_popular : ignorance_error
68
53
  end
69
54
  end
70
-
71
- # Examine popular element and climb DOM tree until multiple
55
+
56
+ # Examine popular element and climb DOM tree until multiple
72
57
  # links are present.
73
58
  def crawl_popular
74
59
  go_to_popular
@@ -25,7 +25,7 @@ module Reflection
25
25
  def see_multiple_links?
26
26
  @current_element.css('a').length < 3 ? false : true
27
27
  end
28
-
28
+
29
29
  ####
30
30
  # Determines if bot can crawl a Popular section.
31
31
  # If it doesn't see popular at first glance, it searches and navigates
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'mechanize'
4
+ require 'blogbot/extraction'
5
+ require 'blogbot/memorization'
6
+ require 'blogbot/navigation'
7
+ require 'blogbot/reflection'
8
+
9
+ # Make a new blogbot that can scan pages and extract the most popular links!
10
+ class Blogbot
11
+ include Extraction
12
+ include Memorization
13
+ include Navigation
14
+ include Reflection
15
+
16
+ attr_accessor(
17
+ :agent,
18
+ :article_link,
19
+ :blog_link,
20
+ :current_element,
21
+ :current_page,
22
+ :indicator,
23
+ :parent,
24
+ :posts_url,
25
+ :target_url
26
+ )
27
+
28
+ def set_agent
29
+ self.agent = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }
30
+ end
31
+
32
+ def initialize
33
+ puts 'Powering up the rubatron generators!!'
34
+ set_agent
35
+ @current_page = nil
36
+ @current_element = nil
37
+ @popular_links = []
38
+ @indicator = nil
39
+ end
40
+
41
+ # GET a page using Mechanize and set to current page.
42
+ def scan(url)
43
+ @target_url = url
44
+ @current_page = @agent.get(@target_url)
45
+ end
46
+
47
+ def reset
48
+ set_agent
49
+ @current_page = nil
50
+ @current_element = nil
51
+ @popular_links = []
52
+ @indicator = nil
53
+ end
54
+
55
+ def ignorance_error
56
+ fail "Sorry, either there are no popular links present
57
+ or this bot isn't smart enough to extract this site yet\n"
58
+ end
59
+ end
@@ -0,0 +1,39 @@
1
+ # Adds capability to extract data in an organized format from webpage.
2
+ module Extraction
3
+ ####
4
+ # Extracts titles and hyperlinks from element being examined.
5
+ # If the text is an empty '' it's an <img>.
6
+ # Images are typically duplicate links and ok to skip.
7
+ def extract_links
8
+ puts 'Not enough links to extract' if see_multiple_links? == false
9
+
10
+ @current_element.css('a').each do |a|
11
+ next if a.text == '' || a['href'] == '#'
12
+ title = a.text
13
+ link = a['href']
14
+ hash = {title: title, link: link}
15
+ @popular_links << hash
16
+ end
17
+ end
18
+
19
+ def display_links
20
+ puts "-"*50
21
+ @popular_links.each do |hash|
22
+ hash.each do |k, v|
23
+ puts "#{k.upcase}: #{v}"
24
+ end
25
+ puts
26
+ end
27
+ puts "-"*50
28
+ @popular_links
29
+ end
30
+
31
+ def extract(url)
32
+ reset
33
+ puts "\nExtracting ...\n"
34
+ scan url
35
+ locate_popular_links
36
+ extract_links
37
+ @popular_links.nil? == true ? simple_error : display_links
38
+ end
39
+ end
@@ -0,0 +1,25 @@
1
+ # Adds capability to memorize things such as URLs, etc.
2
+ # Most variables are stored here.
3
+ module Memorization
4
+ # Searches page for link that says Articles or Blog.
5
+ def find_posts_url
6
+ @article_link = @current_page.link_with(text: /Articles/)
7
+ @blog_link = @current_page.link_with(text: /Blog/)
8
+ end
9
+
10
+ # Memorize posts_url found by find_posts.
11
+ def store_posts_url
12
+ @posts_url =
13
+ case
14
+ when @article_link.nil? == false
15
+ @article_link
16
+ when @blog_link.nil? == false
17
+ @blog_link
18
+ end
19
+ end
20
+
21
+ # Sets search indicator to whatever had 'Popular' in its text.
22
+ def store_indicator
23
+ @indicator = @current_page.search("[text()*='Popular']").first
24
+ end
25
+ end
@@ -0,0 +1,62 @@
1
+ # Adds capability to navigate through posts pages.
2
+ module Navigation
3
+ # Navigates to posts page based off of store_posts_url.
4
+ def go_to_posts_page
5
+ find_posts_url
6
+ store_posts_url
7
+ @current_page = @posts_url.click
8
+ end
9
+
10
+ # Navigates to previous Mechanize page.
11
+ def previous_page
12
+ @current_page = @agent.get(@agent.back['href'])
13
+ end
14
+
15
+ # Sets current element to 'Popular' indicator.
16
+ def go_to_popular
17
+ if see_popular? == false
18
+ puts 'Nothing says "Popular" on this page'
19
+ else
20
+ store_indicator
21
+ @current_element = @indicator
22
+ @current_element
23
+ end
24
+ end
25
+
26
+ # Returns parent of current Nokogiri element
27
+ def find_parent
28
+ @parent = @current_element.parent # one element higher, the div container
29
+ end
30
+
31
+ # Changes current Nokogiri element to its parent.
32
+ def ascend
33
+ if @current_element.ancestors.empty? == true # no more room to ascend
34
+ puts 'At highest element. Nothing left to ascend.'
35
+ else
36
+ @current_element = @current_element.parent
37
+ end
38
+ end
39
+
40
+ def auto_ascend
41
+ ascend until see_multiple_links? == true
42
+ @current_element
43
+ end
44
+
45
+ def locate_popular_links
46
+ if possible_success? == false
47
+ ignorance_error
48
+ elsif see_popular? == true
49
+ crawl_popular
50
+ elsif see_posts? == true
51
+ go_to_posts_page
52
+ see_popular? == true ? crawl_popular : ignorance_error
53
+ end
54
+ end
55
+
56
+ # Examine popular element and climb DOM tree until multiple
57
+ # links are present.
58
+ def crawl_popular
59
+ go_to_popular
60
+ auto_ascend
61
+ end
62
+ end
@@ -0,0 +1,47 @@
1
+ # Adds capability to examine for posts, articles, and links.
2
+ module Reflection
3
+ # Searches for link that says articles or blog.
4
+ def see_posts?
5
+ find_posts_url
6
+ if @article_link.nil? == true && @blog_link.nil? == true
7
+ false # => no article links found
8
+ else
9
+ true
10
+ end
11
+ end
12
+
13
+ # Searches for keyword 'Popular' on current page.
14
+ def see_popular?
15
+ search = @current_page.search "[text()*='Popular']"
16
+ search.empty? ? false : true
17
+ end
18
+
19
+ # Searches for presence of <a> tags in current element.
20
+ def see_links?
21
+ @current_element.css('a').empty? ? false : true
22
+ end
23
+
24
+ # Searches for presences of more than two <a> tags in current element.
25
+ def see_multiple_links?
26
+ @current_element.css('a').length < 3 ? false : true
27
+ end
28
+
29
+ ####
30
+ # Determines if bot can crawl a Popular section.
31
+ # If it doesn't see popular at first glance, it searches and navigates
32
+ # to Posts page (Articles or Blog).
33
+ #
34
+ # If Popular section still doesn't exist, it's a no-go.
35
+ def possible_success?
36
+ if see_popular? == true
37
+ true
38
+ elsif see_posts? == true
39
+ go_to_posts_page # Changes page to look for popular.
40
+ answer = see_popular? == true ? true : false # true if popular is present
41
+ previous_page # Check is complete. Return to original page.
42
+ answer
43
+ else
44
+ false
45
+ end
46
+ end
47
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: blogbot
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Mason
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-09-22 00:00:00.000000000 Z
11
+ date: 2015-09-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -37,6 +37,11 @@ files:
37
37
  - lib/blogbot/memorization.rb
38
38
  - lib/blogbot/navigation.rb
39
39
  - lib/blogbot/reflection.rb
40
+ - test/blogbot.rb
41
+ - test/blogbot/extraction.rb
42
+ - test/blogbot/memorization.rb
43
+ - test/blogbot/navigation.rb
44
+ - test/blogbot/reflection.rb
40
45
  homepage: https://github.com/m8ss/blogbot
41
46
  licenses:
42
47
  - MIT
@@ -61,4 +66,9 @@ rubygems_version: 2.4.6
61
66
  signing_key:
62
67
  specification_version: 4
63
68
  summary: The internet is full of noise. Only read the best.
64
- test_files: []
69
+ test_files:
70
+ - test/blogbot.rb
71
+ - test/blogbot/extraction.rb
72
+ - test/blogbot/memorization.rb
73
+ - test/blogbot/navigation.rb
74
+ - test/blogbot/reflection.rb