blogbot 0.0.1 → 0.0.3.beta

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 45b0e835da301f5cd85e746d9e2d91cb18b4e50d
4
- data.tar.gz: 159a51609d6f1fc12655056e5049328114d16e46
3
+ metadata.gz: 6c5e8d092c22a8a7b833f4ff9ca2316233f83052
4
+ data.tar.gz: b40ef4f4c864f045738d6ec5befcc57a2bf0cd26
5
5
  SHA512:
6
- metadata.gz: 0984803bfe1b234f7ea66c80406636b07b8e9031172993fb13d47799e097c4286721afdd8467eaec391c43eecbcced987d784185606b1857c2e7d327683479f9
7
- data.tar.gz: 9609c4fde40bebb62a82232e8665abaa163a83ab36efe84b1182316d887b4ac9b0595f3eb3706664a22910611508e63a55e78b81490b9a283110d2b624f1e6a2
6
+ metadata.gz: 2ac8cc16f576800bcd01160da49ef160c6905ac3a0a1e109c4b3e619b08878149ef3411eaec67c48f9d96e80716cfc9ad610e1a84935261d856f7d361ebbe1ed
7
+ data.tar.gz: 6db095cf79251f28abc45b99194b6a80afec1dd921d029459e2bd9d23c8b643c8e75737559d77f49f5e075f33a3b9ccff5a3a3bb341bc8c32d65dfba20f5e10b
data/README.md CHANGED
@@ -1,10 +1,10 @@
1
1
  # blogbot
2
2
 
3
- Bot that crawls the most popular articles from websites.
3
+ Bot that extracts the most popular articles from websites.
4
4
 
5
5
  ## DESCRIPTION
6
6
 
7
- The internet is full of noise. Only read the best
7
+ The internet is full of noise. Only read the best.
8
8
 
9
9
  ## DISCLAIMER
10
10
 
data/blogbot.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |gem|
4
4
  gem.name = 'blogbot'
5
- gem.version = '0.0.1'
5
+ gem.version = '0.0.3.beta'
6
6
  gem.date = '2015-09-22'
7
7
  gem.platform = Gem::Platform::RUBY
8
8
  gem.required_ruby_version = '>= 1.8'
@@ -14,7 +14,7 @@ Gem::Specification.new do |gem|
14
14
  gem.description = 'Bot that crawls the most popular articles from websites. '
15
15
  gem.authors = ['John Mason']
16
16
  gem.email = 'mace2345@gmail.com'
17
- gem.homepage = 'https://github.com/m8ss/amazon-search'
17
+ gem.homepage = 'https://github.com/m8ss/blogbot'
18
18
  gem.license = 'MIT'
19
19
 
20
20
  gem.add_runtime_dependency('mechanize', '~> 2.7')
data/lib/blogbot.rb CHANGED
@@ -1,7 +1,70 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'mechanize'
4
+ require './blogbot/extraction'
5
+ require './blogbot/memorization'
6
+ require './blogbot/navigation'
7
+ require './blogbot/reflection'
4
8
 
5
- # Blogbot module
6
- module Blogbot
9
+ # MASTER GAME PLAN:
10
+ #
11
+ # 1) accept target site
12
+ # 2) scan for keyword popular
13
+ # a- if on the homepage and nothing comes up, find the blog page, then scan again
14
+ # b- if still nothing comes up, scan sitemap.xml, etc. for the keyword popular
15
+ # 3) determine div id/class of where keyword popular is located
16
+ # 4) extract all hyperlinks and store in hash
17
+ # 5) index??
18
+ #
19
+
20
+ # Make a new blogbot that can scan pages and extract the most popular links!
21
+ class Blogbot
22
+ include Extraction
23
+ include Memorization
24
+ include Navigation
25
+ include Reflection
26
+
27
+ attr_accessor(
28
+ :agent,
29
+ :article_link,
30
+ :blog_link,
31
+ :current_element,
32
+ :current_page,
33
+ :indicator,
34
+ :parent,
35
+ :posts_url,
36
+ :target_url
37
+ )
38
+
39
+ def set_agent
40
+ self.agent = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }
41
+ end
42
+
43
+ def initialize
44
+ puts 'Powering up the rubatron generators!!'
45
+ set_agent
46
+ @current_page = nil
47
+ @current_element = nil
48
+ @popular_links = {}
49
+ @indicator = nil
50
+ end
51
+
52
+ # GET a page using Mechanize and set to current page.
53
+ def scan(url)
54
+ @target_url = url
55
+ @current_page = @agent.get(@target_url)
56
+ end
57
+
58
+ def reset
59
+ set_agent
60
+ @current_page = nil
61
+ @current_element = nil
62
+ @popular_links = {}
63
+ @indicator = nil
64
+ end
65
+
66
+ def ignorance_error
67
+ raise "Sorry, either there are no popular links present
68
+ or this bot isn't smart enough to extract this site yet/n"
69
+ end
7
70
  end
@@ -0,0 +1,27 @@
1
+ # Adds capability to extract data in an organized format from webpage.
2
+ module Extraction
3
+ ####
4
+ # Extracts titles and hyperlinks from element being examined.
5
+ # If the link text is an empty string (''), the link wraps an <img>.
6
+ # Images are typically duplicate links and ok to skip.
7
+ def extract_links
8
+ puts 'Not enough links to extract' if see_multiple_links? == false
9
+
10
+ @current_element.css('a').each do |a|
11
+ next if a.text == '' || a['href'] == '#'
12
+ title = a.text
13
+ link = a['href']
14
+ @popular_links[link] = title
15
+ end
16
+ @popular_links # Returns entire hash.
17
+ end
18
+
19
+ def extract(url)
20
+ reset
21
+ puts "\nExtracting ...\n"
22
+ scan url
23
+ locate_popular_links
24
+ extract_links
25
+ @popular_links.nil? == true ? simple_error : @popular_links
26
+ end
27
+ end
@@ -0,0 +1,25 @@
1
+ # Adds capability to memorize things such as URLs, etc.
2
+ # Most variables are stored here.
3
+ module Memorization
4
+ # Searches page for link that says Articles or Blog.
5
+ def find_posts_url
6
+ @article_link = @current_page.link_with(text: /Articles/)
7
+ @blog_link = @current_page.link_with(text: /Blog/)
8
+ end
9
+
10
+ # Memorize the posts_url found by find_posts_url.
11
+ def store_posts_url
12
+ @posts_url =
13
+ case
14
+ when @article_link.nil? == false
15
+ @article_link
16
+ when @blog_link.nil? == false
17
+ @blog_link
18
+ end
19
+ end
20
+
21
+ # Sets the search indicator to the first element matched by the 'Popular' text search.
22
+ def store_indicator
23
+ @indicator = @current_page.search("[text()*='Popular']").first
24
+ end
25
+ end
@@ -0,0 +1,77 @@
1
+ # Adds capability to navigate through posts pages.
2
+ module Navigation
3
+ # TODO: add method to auto navigate parents until proper links are present
4
+
5
+ # Navigates to posts page based off of store_posts_url.
6
+ def go_to_posts_page
7
+ find_posts_url
8
+ store_posts_url
9
+ @current_page = @posts_url.click
10
+ end
11
+
12
+ # Navigates to previous Mechanize page.
13
+ def previous_page
14
+ @current_page = @agent.get(@agent.back['href'])
15
+ end
16
+
17
+ # Sets current element to 'Popular' indicator.
18
+ def go_to_popular
19
+ if see_popular? == false
20
+ puts 'Nothing says "Popular" on this page'
21
+ else
22
+ store_indicator
23
+ @current_element = @indicator
24
+ end
25
+ end
26
+
27
+ # Stores the parent of the current Nokogiri element in @parent and returns it.
28
+ def find_parent
29
+ @parent = @current_element.parent # one element higher, the div container
30
+ end
31
+
32
+ # Changes current Nokogiri element to its parent.
33
+ def ascend
34
+ if @current_element.ancestors.empty? == true # no more room to ascend
35
+ puts 'At highest element. Nothing left to ascend.'
36
+ else
37
+ @current_element = @current_element.parent
38
+ end
39
+ end
40
+
41
+ ####
42
+ # FIXME: this selects the first child and can lead
43
+ # the bot down a rabbit hole. Only use when at lower levels of HTML.
44
+ # Need to change this to an iteration in the future.
45
+ #
46
+ # Changes current Nokogiri element to first child.
47
+ def descend
48
+ if @current_element.children.empty? == true # no more room to descend
49
+ puts 'At lowest element. Nothing left to descend.'
50
+ else
51
+ @current_element = @current_element.child
52
+ end
53
+ end
54
+
55
+ def auto_ascend
56
+ ascend until see_multiple_links? == true
57
+ @current_element
58
+ end
59
+
60
+ def locate_popular_links
61
+ if possible_success? == false
62
+ ignorance_error
63
+ elsif see_popular? == true
64
+ crawl_popular
65
+ elsif see_posts? == true
66
+ go_to_posts_page
67
+ see_popular? == true ? crawl_popular : ignorance_error
68
+ end
69
+ end
70
+
71
+ # Examine popular element and climb DOM tree until multiple
72
+ # links are present.
73
+ def crawl_popular
74
+ go_to_popular
75
+ auto_ascend
76
+ end
77
+ end
@@ -0,0 +1,47 @@
1
+ # Adds capability to examine for posts, articles, and links.
2
+ module Reflection
3
+ # Searches for link that says articles or blog.
4
+ def see_posts?
5
+ find_posts_url
6
+ if @article_link.nil? == true && @blog_link.nil? == true
7
+ false # => no article links found
8
+ else
9
+ true
10
+ end
11
+ end
12
+
13
+ # Searches for keyword 'Popular' on current page.
14
+ def see_popular?
15
+ search = @current_page.search "[text()*='Popular']"
16
+ search.empty? ? false : true
17
+ end
18
+
19
+ # Searches for presence of <a> tags in current element.
20
+ def see_links?
21
+ @current_element.css('a').empty? ? false : true
22
+ end
23
+
24
+ # Searches for the presence of more than two <a> tags in current element.
25
+ def see_multiple_links?
26
+ @current_element.css('a').length < 3 ? false : true
27
+ end
28
+
29
+ ####
30
+ # Determines if bot can crawl a Popular section.
31
+ # If it doesn't see popular at first glance, it searches and navigates
32
+ # to Posts page (Articles or Blog).
33
+ #
34
+ # If Popular section still doesn't exist, it's a no-go.
35
+ def possible_success?
36
+ if see_popular? == true
37
+ true
38
+ elsif see_posts? == true
39
+ go_to_posts_page # Changes page to look for popular.
40
+ answer = see_popular? == true ? true : false # true if popular is present
41
+ previous_page # Check is complete. Return to original page.
42
+ answer
43
+ else
44
+ false
45
+ end
46
+ end
47
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: blogbot
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.3.beta
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Mason
@@ -33,7 +33,11 @@ files:
33
33
  - README.md
34
34
  - blogbot.gemspec
35
35
  - lib/blogbot.rb
36
- homepage: https://github.com/m8ss/amazon-search
36
+ - lib/blogbot/extraction.rb
37
+ - lib/blogbot/memorization.rb
38
+ - lib/blogbot/navigation.rb
39
+ - lib/blogbot/reflection.rb
40
+ homepage: https://github.com/m8ss/blogbot
37
41
  licenses:
38
42
  - MIT
39
43
  metadata: {}
@@ -48,9 +52,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
48
52
  version: '1.8'
49
53
  required_rubygems_version: !ruby/object:Gem::Requirement
50
54
  requirements:
51
- - - ">="
55
+ - - ">"
52
56
  - !ruby/object:Gem::Version
53
- version: '0'
57
+ version: 1.3.1
54
58
  requirements: []
55
59
  rubyforge_project:
56
60
  rubygems_version: 2.4.6