blogbot 0.0.1 → 0.0.3.beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 45b0e835da301f5cd85e746d9e2d91cb18b4e50d
4
- data.tar.gz: 159a51609d6f1fc12655056e5049328114d16e46
3
+ metadata.gz: 6c5e8d092c22a8a7b833f4ff9ca2316233f83052
4
+ data.tar.gz: b40ef4f4c864f045738d6ec5befcc57a2bf0cd26
5
5
  SHA512:
6
- metadata.gz: 0984803bfe1b234f7ea66c80406636b07b8e9031172993fb13d47799e097c4286721afdd8467eaec391c43eecbcced987d784185606b1857c2e7d327683479f9
7
- data.tar.gz: 9609c4fde40bebb62a82232e8665abaa163a83ab36efe84b1182316d887b4ac9b0595f3eb3706664a22910611508e63a55e78b81490b9a283110d2b624f1e6a2
6
+ metadata.gz: 2ac8cc16f576800bcd01160da49ef160c6905ac3a0a1e109c4b3e619b08878149ef3411eaec67c48f9d96e80716cfc9ad610e1a84935261d856f7d361ebbe1ed
7
+ data.tar.gz: 6db095cf79251f28abc45b99194b6a80afec1dd921d029459e2bd9d23c8b643c8e75737559d77f49f5e075f33a3b9ccff5a3a3bb341bc8c32d65dfba20f5e10b
data/README.md CHANGED
@@ -1,10 +1,10 @@
1
1
  # blogbot
2
2
 
3
- Bot that crawls the most popular articles from websites.
3
+ Bot that extracts the most popular articles from websites.
4
4
 
5
5
  ## DESCRIPTION
6
6
 
7
- The internet is full of noise. Only read the best
7
+ The internet is full of noise. Only read the best.
8
8
 
9
9
  ## DISCLAIMER
10
10
 
data/blogbot.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |gem|
4
4
  gem.name = 'blogbot'
5
- gem.version = '0.0.1'
5
+ gem.version = '0.0.3.beta'
6
6
  gem.date = '2015-09-22'
7
7
  gem.platform = Gem::Platform::RUBY
8
8
  gem.required_ruby_version = '>= 1.8'
@@ -14,7 +14,7 @@ Gem::Specification.new do |gem|
14
14
  gem.description = 'Bot that crawls the most popular articles from websites. '
15
15
  gem.authors = ['John Mason']
16
16
  gem.email = 'mace2345@gmail.com'
17
- gem.homepage = 'https://github.com/m8ss/amazon-search'
17
+ gem.homepage = 'https://github.com/m8ss/blogbot'
18
18
  gem.license = 'MIT'
19
19
 
20
20
  gem.add_runtime_dependency('mechanize', '~> 2.7')
data/lib/blogbot.rb CHANGED
@@ -1,7 +1,70 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'mechanize'
4
+ require './blogbot/extraction'
5
+ require './blogbot/memorization'
6
+ require './blogbot/navigation'
7
+ require './blogbot/reflection'
4
8
 
5
- # Blogbot module
6
- module Blogbot
9
+ # MASTER GAME PLAN:
10
+ #
11
+ # 1) accept target site
12
+ # 2) scan for keyword popular
13
+ # a- if on homepage and nothing comes up, find blogpage then scan again
14
+ # b- if nothing is coming up, scan sitemap.xml, etc for keyword popular
15
+ # 3) determine div id/class of where keyword popular is located
16
+ # 4) extract all hyperlinks and store in hash
17
+ # 5) index??
18
+ #
19
+
20
# Make a new blogbot that can scan pages and extract the most popular links!
#
# The behaviour is split across the mixed-in modules Extraction,
# Memorization, Navigation and Reflection; this class owns the shared state
# (Mechanize agent, current page/element, collected links) they operate on.
class Blogbot
  include Extraction
  include Memorization
  include Navigation
  include Reflection

  attr_accessor(
    :agent,
    :article_link,
    :blog_link,
    :current_element,
    :current_page,
    :indicator,
    :parent,
    :posts_url,
    :target_url
  )

  # Announces start-up on stdout and puts the bot into its initial state.
  def initialize
    puts 'Powering up the rubatron generators!!'
    reset # same state a caller gets from an explicit #reset
  end

  # Replaces the agent with a new Mechanize instance whose user agent alias
  # is 'Mac Safari'.
  def set_agent
    self.agent = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }
  end

  # GET a page using Mechanize and set it as the current page.
  #
  # @param url the address to fetch; also remembered in @target_url
  # @return whatever the agent's #get returns for that address
  def scan(url)
    @target_url = url
    @current_page = @agent.get(@target_url)
  end

  # Discards the agent, current page, current element, collected links and
  # 'Popular' indicator so a new extraction starts from a clean slate.
  def reset
    set_agent
    @current_page = nil
    @current_element = nil
    @popular_links = {}
    @indicator = nil
  end

  # Signals that no popular links could be located on the target site.
  #
  # @raise [RuntimeError] always
  def ignorance_error
    raise 'Sorry, either there are no popular links present ' \
          "or this bot isn't smart enough to extract this site yet"
  end
end
@@ -0,0 +1,27 @@
1
# Adds capability to extract data in an organized format from webpage.
#
# The including class is expected to provide @current_element and
# @popular_links, plus the methods see_multiple_links?, reset, scan,
# locate_popular_links and ignorance_error that are called below.
module Extraction
  ####
  # Extracts titles and hyperlinks from element being examined and stores
  # them in @popular_links as href => title.
  # If the text is an empty '' it's an <img>.
  # Images are typically duplicate links and ok to skip.
  #
  # @return [Hash] the entire @popular_links hash
  def extract_links
    puts 'Not enough links to extract' unless see_multiple_links?

    @current_element.css('a').each do |anchor|
      title = anchor.text
      href = anchor['href']
      # Skip image links (empty text), anchors without any href (they would
      # otherwise be stored under a nil key) and '#' placeholders.
      next if title == '' || href.nil? || href == '#'

      @popular_links[href] = title
    end
    @popular_links
  end

  # Runs a complete extraction: reset, scan the url, locate the popular
  # section and collect its links.
  #
  # @param url the address handed to scan
  # @return [Hash] href => title pairs when at least one link was collected;
  #   otherwise the result of ignorance_error, which is expected to raise
  def extract(url)
    reset
    puts "\nExtracting ...\n"
    scan url
    locate_popular_links
    extract_links
    # @popular_links is always a Hash here, so "nothing found" means empty.
    @popular_links.empty? ? ignorance_error : @popular_links
  end
end
@@ -0,0 +1,25 @@
1
# Adds capability to memorize things such as URLs, etc.
# Most variables are stored here.
#
# The including class is expected to provide @current_page.
module Memorization
  # Searches page for link that says Articles or Blog and remembers each
  # result in @article_link / @blog_link (nil is treated as "not found" by
  # store_posts_url).
  def find_posts_url
    @article_link = @current_page.link_with(text: /Articles/)
    @blog_link = @current_page.link_with(text: /Blog/)
  end

  # Memorize posts_url found by find_posts_url.
  # The Articles link wins over the Blog link; nil when neither was found.
  def store_posts_url
    @posts_url = @article_link || @blog_link
  end

  # Sets search indicator to the first match of whatever had 'Popular' in
  # its text.
  def store_indicator
    @indicator = @current_page.search("[text()*='Popular']").first
  end
end
@@ -0,0 +1,77 @@
1
# Adds capability to navigate through posts pages.
#
# The including class is expected to provide @agent, @current_page and
# @current_element, plus the helper methods called below (find_posts_url,
# store_posts_url, store_indicator, see_popular?, see_posts?,
# see_multiple_links?, possible_success? and ignorance_error).
module Navigation
  # TODO: add method to auto navigate parents until proper links are present

  # Navigates to posts page based off of store_posts_url.
  def go_to_posts_page
    find_posts_url
    store_posts_url
    @current_page = @posts_url.click
  end

  # Navigates to previous Mechanize page.
  # NOTE(review): this indexes the result of @agent.back with ['href'] and
  # fetches it again — confirm that this really yields the previous URL.
  def previous_page
    @current_page = @agent.get(@agent.back['href'])
  end

  # Sets current element to 'Popular' indicator.
  # Only prints a notice when the page has no such indicator.
  def go_to_popular
    if see_popular?
      store_indicator
      @current_element = @indicator
    else
      puts 'Nothing says "Popular" on this page'
    end
  end

  # Returns parent of current Nokogiri element
  def find_parent
    @parent = @current_element.parent # one element higher, the div container
  end

  # Changes current Nokogiri element to its parent.
  # Leaves the element untouched (and prints a notice) at the top.
  def ascend
    if @current_element.ancestors.empty? # no more room to ascend
      puts 'At highest element. Nothing left to ascend.'
    else
      @current_element = @current_element.parent
    end
  end

  ####
  # FIXME: this selects the first child and can lead
  # bot into rabbit hole. Only use when at lower levels of html.
  # Need to change this to an iteration in the future.
  #
  # Changes current Nokogiri element to first child.
  def descend
    if @current_element.children.empty? # no more room to descend
      puts 'At lowest element. Nothing left to descend.'
    else
      @current_element = @current_element.child
    end
  end

  # Climbs the tree until the current element contains multiple links.
  # Stops at the top of the tree even when no such element was found;
  # ascend cannot move any further there, so looping on would never end.
  #
  # @return the element the climb ended on
  def auto_ascend
    until see_multiple_links?
      break if @current_element.ancestors.empty?

      ascend
    end
    @current_element
  end

  # Finds the 'Popular' section, on the current page or on the posts page,
  # and moves the current element to the container holding its links.
  # Falls back to ignorance_error when no such section can be reached.
  def locate_popular_links
    if !possible_success?
      ignorance_error
    elsif see_popular?
      crawl_popular
    elsif see_posts?
      go_to_posts_page
      see_popular? ? crawl_popular : ignorance_error
    end
  end

  # Examine popular element and climb DOM tree until multiple
  # links are present.
  def crawl_popular
    go_to_popular
    auto_ascend
  end
end
@@ -0,0 +1,47 @@
1
# Adds capability to examine for posts, articles, and links.
#
# The including class is expected to provide @current_page and
# @current_element, plus find_posts_url, go_to_posts_page and previous_page.
module Reflection
  # Searches for link that says articles or blog.
  #
  # @return [Boolean] true when at least one of the two links was found
  def see_posts?
    find_posts_url
    !(@article_link.nil? && @blog_link.nil?)
  end

  # Searches for keyword 'Popular' on current page.
  #
  # @return [Boolean]
  def see_popular?
    !@current_page.search("[text()*='Popular']").empty?
  end

  # Searches for presence of <a> tags in current element.
  #
  # @return [Boolean]
  def see_links?
    !@current_element.css('a').empty?
  end

  # Searches for presences of more than two <a> tags in current element.
  #
  # @return [Boolean]
  def see_multiple_links?
    @current_element.css('a').length >= 3
  end

  ####
  # Determines if bot can crawl a Popular section.
  # If it doesn't see popular at first glance, it searches and navigates
  # to Posts page (Articles or Blog).
  #
  # If Popular section still doesn't exist, it's a no-go.
  #
  # @return [Boolean]
  def possible_success?
    return true if see_popular?
    return false unless see_posts?

    go_to_posts_page # Changes page to look for popular.
    answer = see_popular? # true if popular is present
    previous_page # Check is complete. Return to original page.
    answer
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: blogbot
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.3.beta
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Mason
@@ -33,7 +33,11 @@ files:
33
33
  - README.md
34
34
  - blogbot.gemspec
35
35
  - lib/blogbot.rb
36
- homepage: https://github.com/m8ss/amazon-search
36
+ - lib/blogbot/extraction.rb
37
+ - lib/blogbot/memorization.rb
38
+ - lib/blogbot/navigation.rb
39
+ - lib/blogbot/reflection.rb
40
+ homepage: https://github.com/m8ss/blogbot
37
41
  licenses:
38
42
  - MIT
39
43
  metadata: {}
@@ -48,9 +52,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
48
52
  version: '1.8'
49
53
  required_rubygems_version: !ruby/object:Gem::Requirement
50
54
  requirements:
51
- - - ">="
55
+ - - ">"
52
56
  - !ruby/object:Gem::Version
53
- version: '0'
57
+ version: 1.3.1
54
58
  requirements: []
55
59
  rubyforge_project:
56
60
  rubygems_version: 2.4.6