RubyGems - blogbot - Versions diffs - 0.0.1 → 0.0.3.beta - Mend

blogbot 0.0.1 → 0.0.3.beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 45b0e835da301f5cd85e746d9e2d91cb18b4e50d
-  data.tar.gz: 159a51609d6f1fc12655056e5049328114d16e46
+  metadata.gz: 6c5e8d092c22a8a7b833f4ff9ca2316233f83052
+  data.tar.gz: b40ef4f4c864f045738d6ec5befcc57a2bf0cd26
 SHA512:
-  metadata.gz: 0984803bfe1b234f7ea66c80406636b07b8e9031172993fb13d47799e097c4286721afdd8467eaec391c43eecbcced987d784185606b1857c2e7d327683479f9
-  data.tar.gz: 9609c4fde40bebb62a82232e8665abaa163a83ab36efe84b1182316d887b4ac9b0595f3eb3706664a22910611508e63a55e78b81490b9a283110d2b624f1e6a2
+  metadata.gz: 2ac8cc16f576800bcd01160da49ef160c6905ac3a0a1e109c4b3e619b08878149ef3411eaec67c48f9d96e80716cfc9ad610e1a84935261d856f7d361ebbe1ed
+  data.tar.gz: 6db095cf79251f28abc45b99194b6a80afec1dd921d029459e2bd9d23c8b643c8e75737559d77f49f5e075f33a3b9ccff5a3a3bb341bc8c32d65dfba20f5e10b

data/README.md CHANGED Viewed

@@ -1,10 +1,10 @@
 # blogbot
-Bot that crawls the most popular articles from websites.
+Bot that extracts the most popular articles from websites.
 ## DESCRIPTION
-The internet is full of noise.  Only read the best
+The internet is full of noise.  Only read the best.
 ## DISCLAIMER

data/blogbot.gemspec CHANGED Viewed

@@ -2,7 +2,7 @@
 Gem::Specification.new do |gem|
   gem.name        = 'blogbot'
-  gem.version     = '0.0.1'
+  gem.version     = '0.0.3.beta'
   gem.date        = '2015-09-22'
   gem.platform = Gem::Platform::RUBY
   gem.required_ruby_version = '>= 1.8'
@@ -14,7 +14,7 @@ Gem::Specification.new do |gem|
   gem.description = 'Bot that crawls the most popular articles from websites. '
   gem.authors     = ['John Mason']
   gem.email       = 'mace2345@gmail.com'
-  gem.homepage    = 'https://github.com/m8ss/amazon-search'
+  gem.homepage    = 'https://github.com/m8ss/blogbot'
   gem.license       = 'MIT'
   gem.add_runtime_dependency('mechanize', '~> 2.7')

data/lib/blogbot.rb CHANGED Viewed

@@ -1,7 +1,70 @@
 #!/usr/bin/env ruby
 require 'mechanize'
+# Load the mixins relative to this file rather than the caller's working
+# directory, so the library also loads when required as an installed gem.
+require_relative 'blogbot/extraction'
+require_relative 'blogbot/memorization'
+require_relative 'blogbot/navigation'
+require_relative 'blogbot/reflection'
-# Blogbot module
-module Blogbot
+# MASTER GAME PLAN:
+#
+# 1) accept target site
+# 2) scan the page for the keyword 'Popular'
+#   a- if on the homepage and nothing comes up, find the blog page, then scan again
+#   b- if still nothing comes up, scan sitemap.xml, etc. for the keyword 'Popular'
+# 3) determine div id/class of where keyword popular is located
+# 4) extract all hyperlinks and store in hash
+# 5) index??
+#
+# Make a new blogbot that can scan pages and extract the most popular links!
+class Blogbot
+  include Extraction
+  include Memorization
+  include Navigation
+  include Reflection
+  # State shared with the mixins above, which read and write the matching
+  # instance variables directly.
+  attr_accessor(
+    :agent,
+    :article_link,
+    :blog_link,
+    :current_element,
+    :current_page,
+    :indicator,
+    :parent,
+    :posts_url,
+    :target_url
+  )
+  # Replaces the agent with a new Mechanize instance that identifies itself
+  # with the 'Mac Safari' user agent alias.
+  def set_agent
+    self.agent = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }
+  end
+  # Announces start-up, then puts the bot into its blank starting state.
+  def initialize
+    puts 'Powering up the rubatron generators!!'
+    reset
+  end
+  # GETs +url+ with the Mechanize agent, remembers it as the target URL and
+  # makes the fetched page the current page.  Returns that page.
+  def scan(url)
+    @target_url = url
+    @current_page = @agent.get(@target_url)
+  end
+  # Starts over: new agent, no current page, element or indicator, and an
+  # empty hash of popular links.
+  def reset
+    set_agent
+    @current_page = nil
+    @current_element = nil
+    @popular_links = {}
+    @indicator = nil
+  end
+  # Raises a RuntimeError explaining that no popular links could be found.
+  # The message is joined from adjacent literals so it contains neither a
+  # raw line break nor the source indentation.
+  def ignorance_error
+    raise 'Sorry, either there are no popular links present ' \
+          "or this bot isn't smart enough to extract this site yet"
+  end
+end

data/lib/blogbot/extraction.rb ADDED Viewed

@@ -0,0 +1,27 @@
+# Adds capability to extract data in an organized format from webpage.
+module Extraction
+  # Collects title/hyperlink pairs from the <a> tags under the current
+  # element into @popular_links (key: href, value: link text).
+  # Anchors are skipped when their text is empty (typically an <img> that
+  # duplicates a text link), when they have no href, or when the href is a
+  # bare '#'.
+  # Returns the entire @popular_links hash.
+  def extract_links
+    puts 'Not enough links to extract' unless see_multiple_links?
+    @current_element.css('a').each do |anchor|
+      title = anchor.text
+      href = anchor['href']
+      next if title.empty? || href.nil? || href == '#'
+      @popular_links[href] = title
+    end
+    @popular_links
+  end
+  # Runs a full extraction against +url+: resets the bot, fetches the page,
+  # locates the 'Popular' section and gathers its links.
+  # Returns the hash of links, or raises through ignorance_error when no
+  # link was collected.
+  def extract(url)
+    reset
+    puts "\nExtracting ...\n"
+    scan url
+    locate_popular_links
+    extract_links
+    # @popular_links is always a hash after reset, so test for emptiness.
+    @popular_links.empty? ? ignorance_error : @popular_links
+  end
+end

data/lib/blogbot/memorization.rb ADDED Viewed

@@ -0,0 +1,25 @@
+# Adds capability to memorize things such as URLs, etc.
+# Most variables are stored here.
+module Memorization
+  # Looks on the current page for a link whose text matches "Articles" and
+  # one whose text matches "Blog", remembering each (nil when absent).
+  def find_posts_url
+    @article_link = @current_page.link_with(text: /Articles/)
+    @blog_link = @current_page.link_with(text: /Blog/)
+  end
+  # Remembers the posts link: the Articles link when one was found,
+  # otherwise the Blog link, otherwise nil.
+  def store_posts_url
+    @posts_url = [@article_link, @blog_link].find { |link| !link.nil? }
+  end
+  # Remembers the first node on the current page matched by the 'Popular'
+  # text selector as the search indicator.
+  def store_indicator
+    matches = @current_page.search("[text()*='Popular']")
+    @indicator = matches.first
+  end
+end

data/lib/blogbot/navigation.rb ADDED Viewed

@@ -0,0 +1,77 @@
+# Adds capability to navigate through posts pages.
+module Navigation
+  # TODO: add method to auto navigate parents until proper links are present
+  # Follows the posts link (Articles or Blog) found on the current page and
+  # makes the page it leads to the current page.
+  def go_to_posts_page
+    find_posts_url
+    store_posts_url
+    @current_page = @posts_url.click
+  end
+  # Steps the agent's history back one page and makes the page that is now
+  # on top of the history the current page, without fetching it again.
+  def previous_page
+    @agent.back
+    @current_page = @agent.current_page
+  end
+  # Sets current element to the 'Popular' indicator, or reports that the
+  # page has none.
+  def go_to_popular
+    if see_popular?
+      store_indicator
+      @current_element = @indicator
+    else
+      puts 'Nothing says "Popular" on this page'
+    end
+  end
+  # Remembers and returns the parent of the current Nokogiri element.
+  def find_parent
+    @parent = @current_element.parent # one element higher, the div container
+  end
+  # Changes current Nokogiri element to its parent, or reports that the top
+  # has been reached.
+  def ascend
+    if @current_element.ancestors.empty? # no more room to ascend
+      puts 'At highest element.  Nothing left to ascend.'
+    else
+      @current_element = @current_element.parent
+    end
+  end
+  # FIXME: this selects the first child and can lead the bot down a
+  # rabbit hole.  Only use when at lower levels of html.
+  # Need to change this to an iteration in the future.
+  #
+  # Changes current Nokogiri element to its first child, or reports that
+  # the bottom has been reached.
+  def descend
+    if @current_element.children.empty? # no more room to descend
+      puts 'At lowest element.  Nothing left to descend.'
+    else
+      @current_element = @current_element.child
+    end
+  end
+  # Climbs from the current element until it contains multiple links.
+  # Stops at the top of the tree even if too few links were found, so a
+  # page with too few links cannot make this loop forever.
+  # Returns the element it ended on.
+  def auto_ascend
+    until see_multiple_links?
+      break if @current_element.ancestors.empty? # nowhere left to climb
+      ascend
+    end
+    @current_element
+  end
+  # Moves the bot to the element holding the popular links, navigating to
+  # the posts page first when the current page has no 'Popular' section.
+  # Raises through ignorance_error when no such section can be reached.
+  def locate_popular_links
+    ignorance_error unless possible_success?
+    if see_popular?
+      crawl_popular
+    elsif see_posts?
+      go_to_posts_page
+      see_popular? ? crawl_popular : ignorance_error
+    end
+  end
+  # Examine popular element and climb DOM tree until multiple
+  # links are present.
+  def crawl_popular
+    go_to_popular
+    auto_ascend
+  end
+end

data/lib/blogbot/reflection.rb ADDED Viewed

@@ -0,0 +1,47 @@
+# Adds capability to examine for posts, articles, and links.
+module Reflection
+  # True when the current page has a link reading Articles or Blog.
+  def see_posts?
+    find_posts_url
+    !(@article_link.nil? && @blog_link.nil?)
+  end
+  # True when something on the current page matches the 'Popular' selector.
+  def see_popular?
+    matches = @current_page.search "[text()*='Popular']"
+    !matches.empty?
+  end
+  # True when the current element contains at least one <a> tag.
+  def see_links?
+    anchors = @current_element.css('a')
+    !anchors.empty?
+  end
+  # True when the current element contains three or more <a> tags.
+  def see_multiple_links?
+    @current_element.css('a').length >= 3
+  end
+  # Decides whether the bot can reach a Popular section.
+  # A Popular match on the current page is an immediate yes.  Otherwise,
+  # when a posts link (Articles or Blog) exists, the bot visits that page,
+  # checks it for Popular and then returns to the page it started on.
+  # With neither, the answer is no.
+  def possible_success?
+    return true if see_popular?
+    return false unless see_posts?
+    go_to_posts_page # Changes page to look for popular.
+    found = see_popular?
+    previous_page # Check is complete. Return to original page.
+    found
+  end
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: blogbot
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.3.beta
 platform: ruby
 authors:
 - John Mason
@@ -33,7 +33,11 @@ files:
 - README.md
 - blogbot.gemspec
 - lib/blogbot.rb
-homepage: https://github.com/m8ss/amazon-search
+- lib/blogbot/extraction.rb
+- lib/blogbot/memorization.rb
+- lib/blogbot/navigation.rb
+- lib/blogbot/reflection.rb
+homepage: https://github.com/m8ss/blogbot
 licenses:
 - MIT
 metadata: {}
@@ -48,9 +52,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '1.8'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ">="
+  - - ">"
     - !ruby/object:Gem::Version
-      version: '0'
+      version: 1.3.1
 requirements: []
 rubyforge_project:
 rubygems_version: 2.4.6