RubyGems - blogbot - Versions diffs - 0.0.1 → 0.0.3.beta - Mend

blogbot 0.0.1 → 0.0.3.beta

Files changed (9) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 45b0e835da301f5cd85e746d9e2d91cb18b4e50d
-  data.tar.gz: 159a51609d6f1fc12655056e5049328114d16e46
+  metadata.gz: 6c5e8d092c22a8a7b833f4ff9ca2316233f83052
+  data.tar.gz: b40ef4f4c864f045738d6ec5befcc57a2bf0cd26
 SHA512:
-  metadata.gz: 0984803bfe1b234f7ea66c80406636b07b8e9031172993fb13d47799e097c4286721afdd8467eaec391c43eecbcced987d784185606b1857c2e7d327683479f9
-  data.tar.gz: 9609c4fde40bebb62a82232e8665abaa163a83ab36efe84b1182316d887b4ac9b0595f3eb3706664a22910611508e63a55e78b81490b9a283110d2b624f1e6a2
+  metadata.gz: 2ac8cc16f576800bcd01160da49ef160c6905ac3a0a1e109c4b3e619b08878149ef3411eaec67c48f9d96e80716cfc9ad610e1a84935261d856f7d361ebbe1ed
+  data.tar.gz: 6db095cf79251f28abc45b99194b6a80afec1dd921d029459e2bd9d23c8b643c8e75737559d77f49f5e075f33a3b9ccff5a3a3bb341bc8c32d65dfba20f5e10b

data/README.md CHANGED Viewed

@@ -1,10 +1,10 @@
 # blogbot
-Bot that crawls the most popular articles from websites.
+Bot that extracts the most popular articles from websites.
 ## DESCRIPTION
-The internet is full of noise.  Only read the best
+The internet is full of noise.  Only read the best.
 ## DISCLAIMER

data/blogbot.gemspec CHANGED Viewed

@@ -2,7 +2,7 @@
 Gem::Specification.new do |gem|
   gem.name        = 'blogbot'
-  gem.version     = '0.0.1'
+  gem.version     = '0.0.3.beta'
   gem.date        = '2015-09-22'
   gem.platform = Gem::Platform::RUBY
   gem.required_ruby_version = '>= 1.8'
@@ -14,7 +14,7 @@ Gem::Specification.new do |gem|
   gem.description = 'Bot that crawls the most popular articles from websites. '
   gem.authors     = ['John Mason']
   gem.email       = 'mace2345@gmail.com'
-  gem.homepage    = 'https://github.com/m8ss/amazon-search'
+  gem.homepage    = 'https://github.com/m8ss/blogbot'
   gem.license       = 'MIT'
   gem.add_runtime_dependency('mechanize', '~> 2.7')

data/lib/blogbot.rb CHANGED Viewed

@@ -1,7 +1,70 @@
 #!/usr/bin/env ruby
 require 'mechanize'
+require './blogbot/extraction'
+require './blogbot/memorization'
+require './blogbot/navigation'
+require './blogbot/reflection'
-# Blogbot module
-module Blogbot
+# MASTER GAME PLAN:
+#
+# 1) accept target site
+# 2) scan for keyword popular
+#   a- if on homepage and nothing comes up, find blogpage then scan again
+#   b- if nothing is coming up, scan sitemap.xml, etc for keyword popular
+# 3) determine div id/class of where keyword popular is located
+# 4) extract all hyperlinks and store in hash
+# 5) index??
+#
+# Make a new blogbot that can scan pages and extract the most popular links!
+class Blogbot
+  include Extraction
+  include Memorization
+  include Navigation
+  include Reflection
+  attr_accessor(
+    :agent,
+    :article_link,
+    :blog_link,
+    :current_element,
+    :current_page,
+    :indicator,
+    :parent,
+    :posts_url,
+    :target_url
+  )
+  def set_agent
+    self.agent = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }
+  end
+  def initialize
+    puts 'Powering up the rubatron generators!!'
+    set_agent
+    @current_page = nil
+    @current_element = nil
+    @popular_links = {}
+    @indicator = nil
+  end
+  # GET a page using Mechanize and set to current page.
+  def scan(url)
+    @target_url = url
+    @current_page = @agent.get(@target_url)
+  end
+  def reset
+    set_agent
+    @current_page = nil
+    @current_element = nil
+    @popular_links = {}
+    @indicator = nil
+  end
+  def ignorance_error
+    raise "Sorry, either there are no popular links present
+          or this bot isn't smart enough to extract this site yet/n"
+  end
 end

data/lib/blogbot/extraction.rb ADDED Viewed

@@ -0,0 +1,27 @@
+# Adds capability to extract data in an organized format from webpage.
+module Extraction
+  ####
+  # Extracts titles and hyperlinks from element being examined.
+  # If the text is an empty '' it's an <img>.
+  # Images are typically duplicate links and ok to skip.
+  def extract_links
+    puts 'Not enough links to extract' if see_multiple_links? == false
+    @current_element.css('a').each do |a|
+      next if a.text == '' || a['href'] == '#'
+      title = a.text
+      link = a['href']
+      @popular_links[link] = title
+    end
+    @popular_links # Returns entire hash.
+  end
+  def extract(url)
+    reset
+    puts "\nExtracting ...\n"
+    scan url
+    locate_popular_links
+    extract_links
+    @popular_links.nil? == true ? simple_error : @popular_links
+  end
+end

data/lib/blogbot/memorization.rb ADDED Viewed

@@ -0,0 +1,25 @@
+# Adds capability to memorize things such as URLs, etc.
+# Most variables are stored here.
+module Memorization
+  # Searches page for link that says Articles or Blog.
+  def find_posts_url
+    @article_link = @current_page.link_with(text: /Articles/)
+    @blog_link = @current_page.link_with(text: /Blog/)
+  end
+  # Memorize posts_url found by find_posts.
+  def store_posts_url
+    @posts_url =
+      case
+      when @article_link.nil? == false
+        @article_link
+      when @blog_link.nil? == false
+        @blog_link
+      end
+  end
+  # Sets search indicator to whatever had 'Popular' in its text.
+  def store_indicator
+    @indicator = @current_page.search("[text()*='Popular']").first
+  end
+end

data/lib/blogbot/navigation.rb ADDED Viewed

@@ -0,0 +1,77 @@
+# Adds capability to navigate through posts pages.
+module Navigation
+  # TODO: add method to auto navigate parents until proper links are present
+  # Navigates to posts page based off of store_posts_url.
+  def go_to_posts_page
+    find_posts_url
+    store_posts_url
+    @current_page = @posts_url.click
+  end
+  # Navigates to previous Mechanize page.
+  def previous_page
+    @current_page = @agent.get(@agent.back['href'])
+  end
+  # Sets current element to 'Popular' indicator.
+  def go_to_popular
+    if see_popular? == false
+      puts 'Nothing says "Popular" on this page'
+    else
+      store_indicator
+      @current_element = @indicator
+    end
+  end
+  # Returns parent of current Nokogiri element
+  def find_parent
+    @parent = @current_element.parent # one element higher, the div container
+  end
+  # Changes current Nokogiri element to its parent.
+  def ascend
+    if @current_element.ancestors.empty? == true # no more room to ascend
+      puts 'At highest element.  Nothing left to ascend.'
+    else
+      @current_element = @current_element.parent
+    end
+  end
+  ####
+  # FIXME: this selects the first child and can lead
+  # bot into rabit hole.  Only use when at lower levels of html.
+  # Need to change this to an iteration in the future.
+  #
+  # Changes current Nokogiri element to first child.
+  def descend
+    if @current_element.children.empty? == true # no more room to descend
+      puts 'At lowest element.  Nothing left to descend.'
+    else
+      @current_element = @current_element.child
+    end
+  end
+  def auto_ascend
+    ascend until see_multiple_links? == true
+    @current_element
+  end
+  def locate_popular_links
+    if possible_success? == false
+      ignorance_error
+    elsif see_popular? == true
+      crawl_popular
+    elsif see_posts? == true
+      go_to_posts_page
+      see_popular? == true ? crawl_popular : ignorance_error
+    end
+  end
+  # Examine popular element and climb DOM tree until multiple
+  # links are present.
+  def crawl_popular
+    go_to_popular
+    auto_ascend
+  end
+end

data/lib/blogbot/reflection.rb ADDED Viewed

@@ -0,0 +1,47 @@
+# Adds capability to examine for posts, articles, and links.
+module Reflection
+  # Searches for link that says articles or blog.
+  def see_posts?
+    find_posts_url
+    if @article_link.nil? == true && @blog_link.nil? == true
+      false # => no article links found
+    else
+      true
+    end
+  end
+  # Searches for keyword 'Popular' on current page.
+  def see_popular?
+    search = @current_page.search "[text()*='Popular']"
+    search.empty? ? false : true
+  end
+  # Searches for presence of <a> tags in current element.
+  def see_links?
+    @current_element.css('a').empty? ? false : true
+  end
+  # Searches for presences of more than two <a> tags in current element.
+  def see_multiple_links?
+    @current_element.css('a').length < 3 ? false : true
+  end
+  ####
+  # Determines if bot can crawl a Popular section.
+  # If it doesn't see popular at first glance, it searches and navigates
+  # to Posts page (Articles or Blog).
+  #
+  # If Popular section still doesn't exist, it's a no-go.
+  def possible_success?
+    if see_popular? == true
+      true
+    elsif see_posts? == true
+      go_to_posts_page # Changes page to look for popular.
+      answer =  see_popular? == true ? true : false # true if popular is present
+      previous_page # Check is complete. Return to original page.
+      answer
+    else
+      false
+    end
+  end
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: blogbot
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.3.beta
 platform: ruby
 authors:
 - John Mason
@@ -33,7 +33,11 @@ files:
 - README.md
 - blogbot.gemspec
 - lib/blogbot.rb
-homepage: https://github.com/m8ss/amazon-search
+- lib/blogbot/extraction.rb
+- lib/blogbot/memorization.rb
+- lib/blogbot/navigation.rb
+- lib/blogbot/reflection.rb
+homepage: https://github.com/m8ss/blogbot
 licenses:
 - MIT
 metadata: {}
@@ -48,9 +52,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '1.8'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ">="
+  - - ">"
     - !ruby/object:Gem::Version
-      version: '0'
+      version: 1.3.1
 requirements: []
 rubyforge_project:
 rubygems_version: 2.4.6