search_logger 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +1 -0
- data/.travis.yml +6 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +36 -0
- data/Guardfile +9 -0
- data/README +0 -0
- data/Rakefile +1 -0
- data/bin/search_logger +12 -0
- data/lib/search_logger/csv_exporter.rb +20 -0
- data/lib/search_logger/exec.rb +131 -0
- data/lib/search_logger/google_parser/result.rb +60 -0
- data/lib/search_logger/google_parser.rb +59 -0
- data/lib/search_logger/persistence.rb +68 -0
- data/lib/search_logger/version.rb +3 -0
- data/lib/search_logger/xml_parser.rb +15 -0
- data/lib/search_logger.rb +11 -0
- data/schema.sql +15 -0
- data/search_logger.gemspec +26 -0
- data/spec/acceptance/google_parser_acceptance_spec.rb +93 -0
- data/spec/acceptance/mysql_to_csv_exportation_acceptance_spec.rb +39 -0
- data/spec/acceptance/persistence_acceptance_spec.rb +26 -0
- data/spec/acceptance/test_acceptance_spec.rb +61 -0
- data/spec/acceptance/xml_parser_acceptance_spec.rb +10 -0
- data/spec/spec_helper.rb +11 -0
- data/spec/support/file_repository/exported_from_persistence.csv +119 -0
- data/spec/support/file_repository/google_result.html +115 -0
- data/spec/support/file_repository/google_result_2.html +636 -0
- data/spec/support/file_repository/rankabove_test.xml +9 -0
- data/spec/support/file_repository/sample_mysql_data.rb +6 -0
- data/spec/unit/google_parser/google_parser_result_spec.rb +79 -0
- data/spec/unit/google_parser_spec.rb +69 -0
- data/spec/unit/persistence_spec.rb +86 -0
- data/spec/unit/shell_exec_spec.rb +28 -0
- data/spec/unit/xml_parser_spec.rb +22 -0
- metadata +152 -0
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--colour
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
search_logger (0.0.1)
|
5
|
+
httpclient
|
6
|
+
mysql2
|
7
|
+
nokogiri
|
8
|
+
|
9
|
+
GEM
|
10
|
+
remote: http://rubygems.org/
|
11
|
+
specs:
|
12
|
+
diff-lcs (1.1.3)
|
13
|
+
ffi (1.0.11)
|
14
|
+
guard (1.0.0)
|
15
|
+
ffi (>= 0.5.0)
|
16
|
+
thor (~> 0.14.6)
|
17
|
+
httpclient (2.2.4)
|
18
|
+
mysql2 (0.3.11)
|
19
|
+
nokogiri (1.5.0)
|
20
|
+
rspec (2.8.0)
|
21
|
+
rspec-core (~> 2.8.0)
|
22
|
+
rspec-expectations (~> 2.8.0)
|
23
|
+
rspec-mocks (~> 2.8.0)
|
24
|
+
rspec-core (2.8.0)
|
25
|
+
rspec-expectations (2.8.0)
|
26
|
+
diff-lcs (~> 1.1.2)
|
27
|
+
rspec-mocks (2.8.0)
|
28
|
+
thor (0.14.6)
|
29
|
+
|
30
|
+
PLATFORMS
|
31
|
+
ruby
|
32
|
+
|
33
|
+
DEPENDENCIES
|
34
|
+
guard
|
35
|
+
rspec
|
36
|
+
search_logger!
|
# A sample Guardfile
# More info at https://github.com/guard/guard#readme

# Re-runs RSpec examples when watched files change (rspec 2.x guard).
guard 'rspec', :version => 2 do
  # Any spec file change re-runs that spec.
  watch(%r{^spec/.+_spec\.rb$})
  # A lib file change runs its matching unit spec; the puts echoes the
  # mapped spec path to the console before returning it to guard.
  watch(%r{^lib/search_logger/(.+)\.rb$}) { |m| puts "spec/unit/#{m[1]}_spec.rb"; "spec/unit/#{m[1]}_spec.rb" }
  # Touching the helper re-runs the whole suite.
  watch('spec/spec_helper.rb') { "spec" }
end
data/README
ADDED
File without changes
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
#!/usr/bin/ruby

# Executable entry point: puts the gem's lib/ and root dirs on the load
# path so the binary works from a source checkout, then hands ARGV to
# the interactive Exec runner.
$:.unshift File.expand_path('../../lib/', __FILE__)
$:.unshift File.expand_path('../../', __FILE__)
require "search_logger/exec"

begin
  # NOTE(review): the `shell` local is never used afterwards.
  shell = SearchLogger::Exec.new(ARGV).run
rescue Interrupt
  # Exec disables terminal echo while reading the DB password; restore
  # it if the user hits Ctrl+C mid-prompt.
  system "stty echo"
  exit
end
require 'csv'

# Writes search-result records out to a CSV file with a fixed header row.
class CSVExporter
  # Exports +data+ (an array of row hashes keyed by :searched_keyword,
  # :position, :url, :title and :description) to the target given in
  # options[:to]. When +data+ is empty nothing is written and no file is
  # created.
  def export(data, options)
    @data = data
    @to = options[:to]

    save_to_file unless @data.empty?
  end

  # Opens the target in binary write mode, emits the header row, then one
  # CSV row per record in the order keyword/position/url/title/description.
  def save_to_file
    CSV.open(@to, "wb") do |csv|
      csv << %w{keyword position url title description}
      @data.each do |record|
        csv << record.values_at(:searched_keyword, :position, :url, :title, :description)
      end
    end
  end

end
require "search_logger"

module SearchLogger
  # Interactive command-line runner: validates ARGV, prompts for MySQL
  # credentials, then searches/persists/exports for every keyword found
  # in the given XML file.
  class Exec
    attr_accessor :command
    attr_reader :argv

    # Exits the process with usage help when no argument is given or the
    # given path does not exist; otherwise prompts for DB credentials.
    def initialize argv
      @argv = argv
      unless valid_argv?
        puts "Please, specify a xml file with keywords."
        puts ""
        puts "Example:"
        puts ""
        puts "\s\ssearch_logger ~/my/folder/keywords.xml"
        exit
      end
      unless valid_file?
        puts "The file you specified doesn't exist."
        exit
      end

      puts "Please, enter your MySQL database information."
      asks_for_database_config
    end

    # Prompts for host/username/password (falling back to localhost/root)
    # and retries recursively until a Persistence connection succeeds.
    def asks_for_database_config
      database_config = {
        database: "search_logger",
        host: "localhost",
        username: "root"
      }

      # `and`-chained assignment: only overwrite the default when the
      # user typed a non-empty value.
      print "Host address (defaults to 'localhost'): "
      input = input_text and !input.empty? and database_config[:host] = input

      print "Username (defaults to 'root'): "
      input = input_text and !input.empty? and database_config[:username] = input

      # Disable terminal echo while the password is typed.
      system "stty -echo"
      print "Password: "
      database_config[:password] = input_text
      system "stty echo"

      begin
        @database_connection = SearchLogger::Persistence.new(database_config)
        puts "\n\nA connection was established, starting operation.\n\n"
      rescue
        # NOTE(review): bare rescue treats ANY connection error (bad
        # password, unreachable host) as "DB does not exist".
        puts "The specified DB does not exists. Please, try again.\n\n"
        asks_for_database_config
      end
    end

    # Reads one stripped line from stdin; returns nil on read errors
    # (the bare rescue swallows them).
    def input_text
      begin
        STDOUT.flush
        STDIN.gets.strip
      rescue
      end
    end

    # True when at least one CLI argument was given.
    def valid_argv?
      @argv.length > 0
    end

    # True when the first argument is an existing file.
    # NOTE(review): File.exists? is deprecated in modern Ruby (use exist?).
    def valid_file?
      File.exists? @argv[0]
    end

    # Full pipeline: parse XML keywords, search+persist each, export CSV.
    def run
      puts "1) Parsing the XML file"
      xml = load_xml

      puts "2) Searching Google and saving to MySQL (first 2 pages, 100 results each)"
      xml.each do |value|
        puts "Keyword: #{value.to_s}"

        print "\s\sGoogle: "
        google_results = search_google(value)
        print "\e[0;32mdone.\e[0m "

        print "\s\sMySQL: "
        save_into_mysql(google_results)
        print "\e[0;32mdone.\e[0m\n"
      end

      puts ""
      export_to_csv_file

      puts "\nCongratulations! Everything worked as expected. Please audit the CSV file to guarantee the quality of the data."
    end

    # Parses the keyword XML given on the command line into an array.
    def load_xml
      xml_parser = SearchLogger::XmlParser.new(@argv.first).parse
    end

    # Returns an array of result hashes for the keyword.
    # NOTE(review): live Google queries are commented out — both "pages"
    # are parsed from the same local spec fixture HTML, so this only
    # works when run from the gem root and never hits the network.
    def search_google(query_string)
      source = File.open('spec/support/file_repository/google_result_2.html').read
      #page_one = SearchLogger::GoogleParser.new.query(query_string).per_page(100).page(1).search
      #page_two = SearchLogger::GoogleParser.new.query(query_string).per_page(100).page(2).last_result(page_one).search
      page_one = SearchLogger::GoogleParser.new(source).search
      page_two = SearchLogger::GoogleParser.new(source).last_result(page_one).search
      results = []
      # NOTE(review): `position` is incremented but never used; positions
      # come from the Result objects themselves.
      position = 1
      (page_one + page_two).each do |e|
        results << e.as_ary
        position += 1
      end
      results
    end

    # Persists one keyword's results into the google_results table.
    def save_into_mysql(google_results)
      persistence = @database_connection
      persistence.data(google_results).table('google_results').save
    end

    # Dumps the google_results table to ~/search_logger.csv.
    # NOTE(review): a File handle is opened here and passed as :to, but
    # CSVExporter hands :to to CSV.open, which expects a path; the path
    # is also deleted AFTER opening the handle — verify this actually
    # produces the file as intended.
    def export_to_csv_file
      csv_path = ENV["HOME"] + "/search_logger.csv"
      csv_file = File.open(csv_path, "wb")

      print "3) Loading data from MySQL google_results table... "
      data = @database_connection.table("google_results").load_data
      print "\e[0;32mdone.\n\e[0m"

      print "4) Creating CSV file and adding data in #{csv_path}..."
      File.delete csv_path if File.exists? csv_path
      CSVExporter.new.export data, to: csv_file
      print "\e[0;32mdone.\n\e[0m"
    end
  end
end
module SearchLogger
  class GoogleParser
    # One parsed Google search hit: title, URL, description plus the
    # position it appeared at and the keyword that produced it.
    class Result
      attr_accessor :title, :url, :description, :position, :searched_keyword

      # node is the Nokogiri element for a single result; an explicit
      # `false` (the GoogleParser default) is rejected loudly.
      def initialize(node, position, searched_keyword)
        raise "No HTML node was specified" if node == false
        @position = position
        @searched_keyword = searched_keyword
        @title = nil
        @url = nil
        @description = nil
        @node = node
      end

      # Dispatches on the node's id attribute: plain results have no id
      # (or an id matching /mbb/), news boxes use id="newsbox".
      # Always returns self so callers can collect parsed results.
      def parse
        node_id = @node[:id]
        parse_normal_result if node_id.nil? || node_id =~ /mbb/
        parse_news_result if node_id == "newsbox"
        self
      end

      # Hash form of the result (note: returns a Hash despite the name).
      def as_ary
        {
          title: title,
          url: url,
          description: description,
          position: position,
          searched_keyword: searched_keyword
        }
      end

      # Extracts title/url from the h3 anchor and the description from
      # the div.s snippet; each field stays nil when its node is missing.
      def parse_normal_result
        tap do |result|
          anchor  = @node.at_css('h3 a')
          snippet = @node.at_css('div.s')
          result.title = sanitize_string(anchor.content) unless anchor.nil?
          result.url = sanitize_string(anchor[:href]) unless anchor.nil?
          result.description = sanitize_string(snippet.content) unless snippet.nil?
        end
      end

      # Extracts the same fields from the first entry of a news box.
      def parse_news_result
        tap do |result|
          title_link = @node.at_css('li.w0 span.tl a')
          description = @node.at_css('li.w0 span[dir=ltr]')
          result.title = sanitize_string(title_link.content) unless title_link.nil?
          result.url = sanitize_string(title_link[:href]) unless title_link.nil?
          result.description = sanitize_string(description.content) unless description.nil?
        end
      end

      # Normalizes scraped text: collapses whitespace runs to a single
      # space, trims, and drops Google's trailing "- Cached - Similar"
      # link-trail after an ellipsis.
      def sanitize_string(string)
        cleaned = string.gsub(/&/, "&")
        cleaned = cleaned.gsub(/[\s]{1,99}/, " ")
        cleaned = cleaned.strip
        cleaned.gsub(/\s\.\.\.[\s]{0,1}[w]{0,3}.*- Cached - Similar/, " ...")
      end

      # Bulk-assigns the three content fields from an options hash.
      def set_result(options = {})
        @title       = options[:title]
        @url         = options[:url]
        @description = options[:description]
      end
    end
  end
end
module SearchLogger
  # Builds a Google search URL via a chainable interface, fetches the
  # page (or parses an injected HTML string) and yields Result objects.
  class GoogleParser
    attr_accessor :query, :result, :position_offset, :start, :num, :rough_query

    # An HTML string may be injected as +result+; when present, search
    # parses it instead of hitting the network.
    def initialize(result = false)
      @result = result if result
      @start = 0
      @position_offset = 1
      @rough_query = ""
      @query = ""
      @num = 100
      @base_url = "https://www.google.com/search?"
    end

    # query options

    # Stores the raw keyword and a '+'-joined copy for the URL; chainable.
    def query(query)
      self.rough_query = query
      self.query = query.gsub(/\s/, "+")
      self
    end

    # Sets results-per-page; chainable.
    def per_page(quantity)
      self.num = quantity
      self
    end

    # Converts a 1-based page number into the Google start offset and
    # the position numbering for results on that page; chainable.
    def page(current_page)
      self.start = (current_page - 1) * num
      self.position_offset = start + 1
      self
    end

    # Continues position numbering after a previous page's results;
    # chainable.
    def last_result(result)
      self.position_offset = result.last.position + 1
      self
    end

    # Parses every li.g node into a Result, numbering them from the
    # current position offset.
    def search(result_object = Result)
      require "nokogiri"
      document = Nokogiri::HTML.parse(get_response)
      document.css('li.g').each_with_object([]) do |node, collected|
        collected << result_object.new(node, @position_offset, @rough_query).parse
        @position_offset += 1 unless collected.empty?
      end
    end

    # Assembles and URI-escapes the full search URL.
    def url
      params = []
      params << "q=#{@query}" if @query
      params << "num=#{num}"
      params << "hl=en"
      params << "start=#{@start}"
      require "uri"
      # NOTE(review): URI.encode was removed in Ruby 3 — this only runs
      # on the old Rubies this gem targets.
      URI.encode(@base_url + params.join("&"))
    end

    private

    # Returns the injected HTML when present, otherwise performs a live
    # GET following redirects.
    def get_response
      return @result if @result
      require 'httpclient'
      HTTPClient.new.get(url, :follow_redirect => true).body
    end
  end
end
# encoding: utf-8
require "mysql2"
module SearchLogger
  # Thin MySQL persistence layer: collects rows and a table name through
  # a chainable interface, then builds and runs INSERT/SELECT statements.
  class Persistence
    attr_accessor :table, :client, :connection_config
    attr_reader :data

    # Connects immediately; any Mysql2 connection error propagates to
    # the caller (Exec relies on that to re-prompt for credentials).
    def initialize(connection_config = { host: "localhost", username: "root", database: "search_logger" })
      @data = []
      @connection_config = connection_config
      establish_connection
    end

    def establish_connection
      @client = ::Mysql2::Client.new(@connection_config)
    end

    # sets up the operation properties

    # Dual-role accessor that shadows the attr_reader above: with no
    # argument (or []) it returns the stored rows; given a Hash or an
    # array of Hashes it stores them and returns self for chaining.
    def data(data = [])
      return @data if data.empty?
      data = [data] if data.is_a?(Hash)
      @data = data
      self
    end

    # Dual-role accessor for the table name: reader when called without
    # an argument, chainable writer otherwise.
    def table(table = nil)
      return @table unless table
      @table = table
      self
    end

    # Builds a single multi-row INSERT from the stored data. Field names
    # are taken from the first row's keys; values are escaped through the
    # client.
    # NOTE(review): table and column names are interpolated unescaped,
    # and every value is quoted as a string — safe only because both come
    # from this gem's own code, not user input.
    def save_to_sql
      fields, values = [], []
      fields_complete = false
      # gathers fields and values
      data.each_with_index do |e, index|
        values[index] = []
        e.each do |key, value|
          fields << key.to_s unless fields_complete
          values[index] << client.escape(value.to_s)
        end
        fields_complete = true
      end

      # creates values string
      each_record_values = []
      values.each do |e|
        each_record_values << "('#{e.join("', '")}')"
      end
      sql = "INSERT INTO #{table} (#{fields.join(', ')}) VALUES #{each_record_values.join(', ')}"
    end

    # Executes the generated INSERT; the client is injectable for tests.
    def save(client = @client)
      client.query(save_to_sql)
    end

    def load_to_sql
      "SELECT * FROM #{table}"
    end

    # Returns every row of the current table as an array of hashes with
    # symbol keys.
    def load_data(client = @client)
      [].tap do |e|
        client.query(load_to_sql).each(symbolize_keys: true) { |row| e << row }
      end
    end
  end
end
module SearchLogger
  # Reads a keyword XML file and extracts the text of matching nodes.
  class XmlParser
    attr_reader :file

    # Opens the file eagerly; a bad path raises here rather than in parse.
    def initialize(xml_file)
      @file = File.open(xml_file)
    end

    # Returns the text content of every node matched by the CSS-style
    # +path+ (default: <keyword> elements under <keywords>).
    def parse(path = 'keywords keyword')
      require "nokogiri"
      document = Nokogiri::XML @file
      keywords = []
      document.css(path).each { |node| keywords << node.content }
      keywords
    end
  end
end
@@ -0,0 +1,11 @@
|
|
1
|
+
require "search_logger/version"
|
2
|
+
|
3
|
+
require "search_logger/csv_exporter"
|
4
|
+
require "search_logger/google_parser"
|
5
|
+
require "search_logger/google_parser/result"
|
6
|
+
require "search_logger/persistence"
|
7
|
+
require "search_logger/xml_parser"
|
8
|
+
|
9
|
+
module SearchLogger
|
10
|
+
|
11
|
+
end
|
-- Rebuilds the search_logger database from scratch (destructive!).
DROP DATABASE IF EXISTS search_logger;
CREATE DATABASE search_logger;
USE search_logger;

-- One row per parsed Google result for a searched keyword; position is
-- the result's 1-based rank across the fetched pages.
CREATE TABLE google_results(
id int auto_increment,
searched_keyword varchar(250),
title text,
url text,
description text,
position int,
created_at datetime,
PRIMARY KEY(id)
) DEFAULT CHARACTER SET utf8;
-- Results are exported ordered by rank, so index position.
CREATE INDEX position_idx ON google_results(position);
# -*- encoding: utf-8 -*-
# Gem specification: pulls the version from lib/ and the file lists from git.
$:.push File.expand_path("../lib", __FILE__)
require "search_logger/version"

Gem::Specification.new do |s|
  s.name        = "search_logger"
  s.version     = SearchLogger::VERSION
  s.authors     = ["kurko"]
  s.email       = ["chavedomundo@gmail.com"]
  s.homepage    = ""
  s.summary     = %q{Searches Google and saves results.}
  # Fixed grammar in the user-facing description ("read a XML" -> "reads an XML").
  s.description = %q{This gem reads an XML file, searching for each one of the keywords. Then, all results are saved into MySQL and later exported to CSV. This is a concept app.}

  s.rubyforge_project = "search_logger"

  # File lists come from git, so the gem must be built from a checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_development_dependency "rspec"
  s.add_development_dependency "guard"
  s.add_runtime_dependency "nokogiri"
  s.add_runtime_dependency "httpclient"
  s.add_runtime_dependency "mysql2"
end
# encoding: utf-8
# Acceptance specs for GoogleParser.
# NOTE(review): the first half performs LIVE Google searches — these
# specs are slow, network-dependent, and brittle against markup changes.
# The "mocked response" half parses a checked-in fixture instead.
require "spec_helper"

describe "Google parser" do
  before :all do
    @response = SearchLogger::GoogleParser.new.query('test').per_page(5).search
  end

  it "searches for a keyword" do
    @response.should be_kind_of Array
  end

  it "searches with hebraic keywords" do
    @response = SearchLogger::GoogleParser.new.query('שפות תכנות').per_page(5).search
    @response.should have(5).items
  end

  # depending on the query string, Google returns more or less than 100 links
  it "returns around 100 results per page by default" do
    response = SearchLogger::GoogleParser.new.query('amazon').search
    response.should have_at_least(95).items
    response.should have_at_most(105).items
  end

  context "multiple pages" do
    before :all do
      @all_results = SearchLogger::GoogleParser.new.query('amazon').per_page(5).search.map { |e| e.title }
    end

    it "returns 2 results" do
      (3..7).should cover(@all_results.size)
    end

    # Google might include news in some responses, so we don't compare them with ==
    it "takes the first 2 pages of results" do
      @page_one = SearchLogger::GoogleParser.new.query('amazon').per_page(1).page(1)
      @page_two = SearchLogger::GoogleParser.new.query('amazon').per_page(1).page(2)
      result = @page_one.search.map { |e| e.title } + @page_two.search.map { |e| e.title }
      @all_results.should include(result.first, result.last)
    end

    it "has the right position numbers" do
      @page_one = SearchLogger::GoogleParser.new.query('amazon').per_page(1).page(1).search
      @page_two = SearchLogger::GoogleParser.new.query('amazon').per_page(1).page(2).last_result(@page_one).search
      @page_one.first.position.should == 1
      @page_two.first.position.should == @page_one.size + 1
    end
  end

  context "for each result" do
    it "extracts title" do
      @response.each { |e| e.title.should_not == "" }
    end

    it "extract URL" do
      @response.each { |e| e.url.should_not == "" }
    end

    it "extract description" do
      @response.each { |e| e.description.should_not == "" }
    end
  end

  # Deterministic assertions against the checked-in fixture HTML.
  describe "parsing a mocked response", wip: true do
    let(:result_double) { File.open('spec/support/file_repository/google_result_2.html').read }
    context "item 1" do
      subject { SearchLogger::GoogleParser.new(result_double).search[0] }

      its(:title) { should == "Xovi: mehr als ein SEO Tool - online Marketing (SEO, SEM, Affiliate ..." }
      its(:url) { should == "http://www.xovi.de/" }
      its(:description) { should == "Setzen Sie unsere SEO Software f?r Ihr Online Marketing Budget intelligent und erfolgreich ein. Verlassen Sie sich nicht auf Ihr Bauchgef?hl oder Ihre Erfahrung ..." }
      its(:position) { should == 1 }
    end

    context "item 7" do
      subject { SearchLogger::GoogleParser.new(result_double).search[7] }

      its(:title) { should == "SEO Company India" }
      its(:url) { should == "http://www.seosoftwareservices.com/" }
      its(:description) { should == "SEO Company India Services SEO Company Offers Standard SEO Company to enhance your ranking.We Provide Professional SEO Company India, SEO India ..." }
      its(:position) { should == 8 }
    end

    context "item 18" do
      subject { SearchLogger::GoogleParser.new(result_double).search[18] }

      its(:title) { should == "Microsite Masters Rank Tracker - Accurate Keyword Tracking for ..." }
      its(:url) { should == "http://www.micrositemasters.com/" }
      its(:description) { should == "Microsite Masters search engine optimization software tracks keywords across multiple URL's, ..." }
      its(:position) { should == 19 }
    end
  end
end
# encoding: utf-8
# Acceptance spec for CSVExporter: rows shaped like Persistence#load_data
# output are written to a scratch CSV inside the spec file repository.
require "spec_helper"

describe "Exporting MySQL data to a CSV file" do
  let(:data) {
    [
      { id: 726, searched_keyword: "amazon", title: "This is a title",
        url: "www.github.com", description: "First description.",
        position: 1, created_at: nil },
      { id: 727, searched_keyword: "שפות תכנות", title: "שפות תכנות",
        url: "www.github.com", description: "שפות, תכנות.",
        position: 2, created_at: nil },
      { id: 728, searched_keyword: "amazon", title: "This is the, third title",
        url: "www.github.com", description: "Third description.",
        position: 3, created_at: nil }
    ]
  }
  let(:target_file) { File.expand_path("../../support/file_repository/exported_from_persistence.csv", __FILE__) }

  # Start each example from a clean slate.
  before do
    File.delete target_file if File.exists? target_file
  end

  # NOTE(review): this example has no assertion — it only proves export
  # of [] doesn't raise; it does not verify that no file was created.
  it "if no data is sent, no data is saved" do
    CSVExporter.new.export [], to: target_file
  end

  it "saves data into a CSV file" do
    CSVExporter.new.export data, to: target_file
    File.exists?(target_file).should be_true
    saved_data = CSV.parse File.read(target_file)
    # Joining with ',' flattens quoting, so embedded commas in fields
    # reappear as separators in the expected strings below.
    saved_data[0].join(',').should == 'keyword,position,url,title,description'
    saved_data[1].join(',').should == 'amazon,1,www.github.com,This is a title,First description.'
    saved_data[2].join(',').should == 'שפות תכנות,2,www.github.com,שפות תכנות,שפות, תכנות.'
    saved_data[3].join(',').should == 'amazon,3,www.github.com,This is the, third title,Third description.'
  end

  pending "check if dir has write permission"
end
# Acceptance spec for Persistence.
# NOTE(review): requires a running local MySQL with the search_logger
# schema loaded; it DELETEs all rows from google_results before each run.
require "spec_helper"

describe "Data persistence" do
  let(:data) {
    [ {searched_keyword: "amazon", title: "This is a title", url: "www.github.com", description: "First description.", position: 1},
      {searched_keyword: "amazon", title: "This is the second title", url: "www.github.com", description: "Second description.", position: 2},
      {searched_keyword: "amazon", title: "This is the third title", url: "www.github.com", description: "Third description.", position: 3}
    ]
  }

  before do
    @persistence = SearchLogger::Persistence.new
    @persistence.client.query("DELETE FROM google_results")
  end

  it "stores an array of values in the database" do
    @persistence.data(data)
    @persistence.table('google_results').save
    # A second connection proves the rows were really committed.
    @another_persistence_object = SearchLogger::Persistence.new
    saved_data = @another_persistence_object.table("google_results").load_data
    # Strip the DB-generated columns before comparing with the input.
    saved_data.map { |e| e.tap { |x| x.delete(:id) }.tap { |x| x.delete(:created_at) } }.should == data
  end

  pending "when there's no database created"

end