news_crawler 0.0.0.pre.1
- checksums.yaml +7 -0
- data/bin/news_crawler +94 -0
- data/lib/news_crawler/autostart.rb +33 -0
- data/lib/news_crawler/config.rb +53 -0
- data/lib/news_crawler/crawler_module.rb +70 -0
- data/lib/news_crawler/default_config.yml +13 -0
- data/lib/news_crawler/default_sds.yml +1 -0
- data/lib/news_crawler/downloader.rb +112 -0
- data/lib/news_crawler/link_selector/same_domain_selector.rb +172 -0
- data/lib/news_crawler/nc_logger.rb +49 -0
- data/lib/news_crawler/storage/raw_data/mongo_storage.rb +77 -0
- data/lib/news_crawler/storage/raw_data/raw_data_engine.rb +67 -0
- data/lib/news_crawler/storage/raw_data.rb +74 -0
- data/lib/news_crawler/storage/url_queue/mongo_storage.rb +218 -0
- data/lib/news_crawler/storage/url_queue/url_queue_engine.rb +124 -0
- data/lib/news_crawler/storage/url_queue/url_queue_error.rb +28 -0
- data/lib/news_crawler/storage/url_queue.rb +150 -0
- data/lib/news_crawler/url_helper.rb +50 -0
- data/lib/news_crawler/utils/robots_patch.rb +34 -0
- data/lib/news_crawler.rb +47 -0
- metadata +203 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 3cdcc66767575d17c8a1f5d25f00713f74d51e98
+  data.tar.gz: 7c7d2c838066a45c4365a0756625e4a08c5d4482
+SHA512:
+  metadata.gz: 75f259b0cafbff494302955a1b7968d331d69298c8e5a31185b0b8288408f7af323721eb83a2043005e2acb7e12337051622e191356da497892eb35701dfe7d8
+  data.tar.gz: 4382b0ed8d4bc28134d8af322f9a2f46b95a8732b2629835ae0b49da4e3ecdaa6fcce671c7fd5adc8cc198fb8e107360678277767b9cc9d706bae7a6530b2dd4
data/bin/news_crawler
ADDED
@@ -0,0 +1,94 @@
+#! /usr/bin/env ruby
+# -*- coding: utf-8 -*-
+
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'optparse'
+require 'news_crawler/config'
+require 'news_crawler/nc_logger'
+
+require 'news_crawler/downloader'
+require 'news_crawler/link_selector/same_domain_selector'
+
+include NewsCrawler::Storage
+
+options = {}
+
+OptionParser.new do | opts |
+  opts.banner = "Usage: news_crawler [options] url"
+
+  opts.on('-c', "--app-conf FILE", "Application configuration file") do | f |
+    options[:app_conf] = File.expand_path(f)
+    raise Errno::ENOENT unless File.exist? options[:app_conf]
+  end
+
+  # '-s' instead of the original '-sds', which collides with OptionParser's
+  # single-character short-option convention
+  opts.on('-s', "--sds-conf FILE", "Same domain selector configuration file") do | f |
+    options[:sds_conf] = File.expand_path(f)
+    raise Errno::ENOENT unless File.exist? options[:sds_conf]
+  end
+
+  # '-C' instead of the original '-c', which is already taken by --app-conf
+  opts.on('-C', '--[no-]cleardb', "Clear database") do | cd |
+    options[:cleardb] = cd
+  end
+
+  opts.on('-d', "--max-depth DEPTH", OptionParser::DecimalInteger,
+          'Maximum depth of url to crawl') do | d |
+    options[:max_depth] = d
+  end
+end.parse!
+
+NewsCrawler::CrawlerConfig.load_application_config(options[:app_conf]) unless options[:app_conf].nil?
+NewsCrawler::CrawlerConfig.load_samedomainselector_config(options[:sds_conf]) unless options[:sds_conf].nil?
+
+config = SimpleConfig.for :application
+NewsCrawler::Storage::RawData.set_engine(config.db.engine.intern)
+NewsCrawler::Storage::URLQueue.set_engine(config.db.engine.intern)
+
+if (options[:cleardb])
+  URLQueue.clear
+  RawData.clear
+end
+
+if ARGV.size > 0
+  url = ARGV[0]
+  URLQueue.add(url)
+end
+
+puts "Starting Downloader"
+dwl = NewsCrawler::Downloader.new(false)
+dwl.async.run
+
+puts "Starting SDS"
+se = NewsCrawler::LinkSelector::SameDomainSelector.new(options[:max_depth] || 1, false)
+se.async.run
+puts "Stopping SDS"
+se.graceful_terminate
+se.terminate
+puts "SDS stopped"
+
+sleep(5)
+
+puts "Stopping Downloader"
+dwl.graceful_terminate
+dwl.terminate
+puts "Downloader stopped"
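
Taken together, the script seeds the queue with the positional url argument, starts the downloader and link selector as Celluloid actors, then shuts both down gracefully. A typical invocation might look like this (config path and url are hypothetical):

    news_crawler --app-conf ./app_conf.yml --max-depth 2 http://example.com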
data/lib/news_crawler/autostart.rb
ADDED
@@ -0,0 +1,33 @@
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'simple_config'
+require 'mongo'
+
+require 'news_crawler/config'
+require 'news_crawler/storage/url_queue'
+
+SimpleConfig.for(:application) do
+  set :prefix, 'test'
+end
+
+config = SimpleConfig.for :application
+NewsCrawler::Storage::URLQueue.set_engine(config.db.engine.intern)
data/lib/news_crawler/config.rb
ADDED
@@ -0,0 +1,53 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'simple_config'
+
+module NewsCrawler
+  class CrawlerConfig
+    DEFAULT_CONFIG = File.join(File.dirname(__FILE__),
+                               './default_config.yml')
+    DEFAULT_SDS_CONFIG = File.join(File.dirname(__FILE__),
+                                   './default_sds.yml')
+
+    def self.load_application_config(file = CrawlerConfig::DEFAULT_CONFIG)
+      if ((file != DEFAULT_CONFIG) || (@app_loaded != true))
+        @app_loaded = true
+        SimpleConfig.for :application do
+          load file
+        end
+      end
+    end
+
+    def self.load_samedomainselector_config(file = CrawlerConfig::DEFAULT_SDS_CONFIG)
+      if ((file != DEFAULT_SDS_CONFIG) || (@sds_loaded != true))
+        @sds_loaded = true
+        SimpleConfig.for :same_domain_selector do
+          load file
+        end
+      end
+    end
+  end
+end
+
+NewsCrawler::CrawlerConfig.load_application_config
+NewsCrawler::CrawlerConfig.load_samedomainselector_config
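
For illustration, here is a hedged sketch of how these loaders are consumed elsewhere in the gem. The db.engine and mongodb keys are assumed from the code in this diff (default_config.yml itself, +13 lines, is not shown), and the path is invented:

    require 'news_crawler/config'

    # Load a custom application config (hypothetical path); with no
    # argument the bundled default_config.yml is used instead.
    NewsCrawler::CrawlerConfig.load_application_config('/etc/news_crawler/app.yml')

    config = SimpleConfig.for :application
    config.db.engine     # e.g. 'mongo' -- selects the storage backend
    config.mongodb.host  # connection settings read by the Mongo storage engines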
data/lib/news_crawler/crawler_module.rb
ADDED
@@ -0,0 +1,70 @@
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'news_crawler/storage/url_queue'
+require 'thread'
+
+module NewsCrawler
+  # Include this to get basic module methods
+  module CrawlerModule
+    # Mark the given url as processed by the current module
+    # @param [ String ] url
+    def mark_processed(url)
+      URLQueue.mark(self.class.name, url, URLQueue::PROCESSED)
+    end
+
+    # Mark the given url as unprocessed by the current module
+    # @param [ String ] url
+    def mark_unprocessed(url)
+      URLQueue.mark(self.class.name, url, URLQueue::UNPROCESSED)
+    end
+
+    # Find all visited but still unprocessed urls
+    # @param [ Fixnum ] max_depth maximum url depth returned (inclusive)
+    # @return [ Array ] URL list
+    def find_unprocessed(max_depth = -1)
+      URLQueue.find_all(self.class.name, URLQueue::UNPROCESSED, max_depth)
+    end
+
+    # Find all visited urls with the given process state for the current module
+    # @param [ String ] state one of unprocessed, processing, processed
+    # @param [ Fixnum ] max_depth maximum url depth returned (inclusive)
+    # @return [ Array ] URL list
+    def find_all(state, max_depth = -1)
+      URLQueue.find_all(self.class.name, state, max_depth)
+    end
+
+    # Find one visited url with the given process state for the current module
+    # @param [ String ] state
+    # @param [ Fixnum ] max_depth maximum url depth returned (inclusive)
+    # @return [ String, nil ] URL or nil if no such url exists
+    def find_one(state, max_depth = -1)
+      URLQueue.find_one(self.class.name, state, max_depth)
+    end
+
+    # Atomically get the next unprocessed url and mark it as processing
+    # @param [ Fixnum ] max_depth maximum url depth returned (inclusive)
+    # @return [ String, nil ] URL or nil if no such url exists
+    def next_unprocessed(max_depth = -1)
+      URLQueue.next_unprocessed(self.class.name, max_depth)
+    end
+  end
+end
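
As a usage illustration, a minimal sketch of a hypothetical processing module built on CrawlerModule (the class name and body are invented for this example, not part of the gem):

    require 'news_crawler/crawler_module'
    require 'news_crawler/storage/raw_data'

    module NewsCrawler
      # Hypothetical module: walks the queue and reports page sizes.
      class PageSizeCounter
        include NewsCrawler::CrawlerModule

        def run
          # next_unprocessed atomically claims a url for this module,
          # so several modules can share one queue without stepping
          # on each other's per-module state.
          while (url = next_unprocessed)
            body = Storage::RawData.find_by_url(url)
            puts "#{url}: #{body.to_s.bytesize} bytes"
            mark_processed(url)
          end
        end
      end
    end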
data/lib/news_crawler/default_sds.yml
ADDED
@@ -0,0 +1 @@
+:exclude:
data/lib/news_crawler/downloader.rb
ADDED
@@ -0,0 +1,112 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'celluloid'
+require 'typhoeus'
+require 'simple_config'
+
+require 'news_crawler/config'
+require 'news_crawler/storage/raw_data'
+require 'news_crawler/utils/robots_patch'
+require 'news_crawler/nc_logger'
+
+module NewsCrawler
+  # This class implements a parallel downloader based on Typhoeus
+  # with a given queue
+  class Downloader
+    include Celluloid
+
+    CONCURRENT_DOWNLOAD = 4
+
+    # Construct downloader with an URLQueue
+    # @param [ Boolean ] start_on_create whether to start the downloader immediately
+    # @param [ NewsCrawler::URLQueue ] queue url queue
+    def initialize(start_on_create = true, queue = NewsCrawler::Storage::URLQueue, **opts)
+      @queue = queue
+      @urls = queue.find_unvisited
+      @concurrent_download = opts[:concurrent] || CONCURRENT_DOWNLOAD
+      @wait_time = 1
+      @status = :running
+      @stopping = false
+      wait_for_url if start_on_create
+    end
+
+    # Start downloader with current queue
+    # Successfully fetched URLs are marked and the results stored in the DB
+    def run
+      @status = :running
+      hydra = Typhoeus::Hydra.new(max_concurrency: @concurrent_download)
+      # TODO Log here
+      @urls = @urls.keep_if do | url |
+        Robots.instance.allowed? url
+      end
+      requests = @urls.map do | url |
+        re = Typhoeus::Request.new(url, followlocation: true)
+        re.on_complete do | response |
+          if response.success?
+            Storage::RawData.add(url, response.response_body)
+            @queue.mark_visited url
+          else
+            NCLogger.get_logger.warn("[WARNING] Fetch error [#{url}]")
+          end
+        end
+        hydra.queue re
+        re
+      end
+      hydra.run
+      @urls = []
+      wait_for_url
+    end
+
+    # Gracefully terminate this downloader
+    def graceful_terminate
+      @stopping = true
+      while @status == :running
+        sleep(1)
+      end
+    end
+
+    private
+    # Wait for new urls to be added to the queue, using an exponential backoff
+    def wait_for_url
+      @status = :waiting
+      if @stopping # check for stop flag
+        return
+      end
+      sleep @wait_time
+      get_new_url
+      if @urls.size == 0
+        if @wait_time < 30
+          @wait_time = @wait_time * 2
+        end
+        wait_for_url
+      else
+        @wait_time = 1
+        run
+      end
+    end
+
+    def get_new_url
+      @urls = @queue.find_unvisited
+    end
+  end
+end
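
Because Downloader includes Celluloid, each instance is an actor. A hedged sketch of driving one by hand, with engine setup as in the bundled executable and an invented seed url:

    require 'news_crawler/downloader'
    require 'news_crawler/storage/url_queue'

    NewsCrawler::Storage::URLQueue.set_engine(:mongo)
    NewsCrawler::Storage::URLQueue.add('http://example.com')  # hypothetical seed

    dwl = NewsCrawler::Downloader.new(false)  # false: don't start fetching yet
    dwl.async.run                             # run on the actor's own thread
    # ... let it crawl for a while ...
    dwl.graceful_terminate                    # block until no fetch is in flight
    dwl.terminate                             # shut down the Celluloid actor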
data/lib/news_crawler/link_selector/same_domain_selector.rb
ADDED
@@ -0,0 +1,172 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'celluloid'
+require 'nokogiri'
+
+require 'news_crawler/storage/raw_data'
+require 'news_crawler/url_helper'
+require 'news_crawler/crawler_module'
+require 'news_crawler/nc_logger'
+
+module NewsCrawler
+  module LinkSelector
+    # Selects all links pointing to the same domain.
+    # The domain is taken from the database.
+    class SameDomainSelector
+      include NewsCrawler::URLHelper
+      extend NewsCrawler::URLHelper
+
+      include NewsCrawler::CrawlerModule
+      include Celluloid
+
+      # Create a new selector with a queue
+      # Selected URLs are put back into the queue
+      # @param [ Fixnum ] max_depth maximum depth to crawl
+      # @param [ Boolean ] start_on_create whether to start the selector immediately
+      def initialize(max_depth = -1, start_on_create = true)
+        @max_depth = max_depth
+        @wait_time = 1
+        @status = :running
+        @stopping = false
+        run if start_on_create
+      end
+
+      # Extract urls from a fetched page
+      def extract_url(url)
+        doc = RawData.find_by_url(url)
+        html_doc = Nokogiri::HTML(doc)
+        results = []
+
+        inner_url = html_doc.xpath('//a').collect { | a_el |
+          temp_url = (a_el.attribute 'href').to_s
+          if (!temp_url.nil?) && (temp_url[0] == '/')
+            temp_url = url + temp_url
+          end
+          temp_url
+        }
+
+        inner_url.delete_if { | i_url |
+          (i_url.nil?) || (i_url.size == 0) || (i_url == '#')
+        }
+
+        # select urls from the same domain
+        inner_url.select { | o_url |
+          if (same_domain?(o_url, url))
+            if (!SameDomainSelector.exclude?(o_url))
+              begin
+                URLQueue.add(o_url, url)
+                results << [o_url, url]
+              rescue URLQueue::DuplicateURLError
+              end
+            else
+              # TODO Log here
+            end
+          end
+        }
+      end
+
+      def run
+        @status = :running
+        return if @stopping
+        if @max_depth == 0
+          @status = :stopped
+          return
+        end
+        while !@stopping
+          url = next_unprocessed(@max_depth - 1)
+          while (url.nil?)
+            wait_for_url
+            url = next_unprocessed(@max_depth - 1)
+          end
+          NCLogger.get_logger.info "Processing #{url}"
+          extract_url(url)
+          mark_processed(url)
+        end
+      end
+
+      # Test whether a url is excluded
+      # @param [ String ] url
+      # @return [ Boolean ] true if url is excluded, false otherwise
+      def self.exclude?(url)
+        config = SimpleConfig.for :same_domain_selector
+        exclude_list = []
+        url_domain = get_url_path(url)[:domain]
+        begin
+          exclude_group = config.exclude
+        rescue NoMethodError
+          return false
+        end
+
+        exclude_group.to_hash.keys.each do | url_e |
+          if url_domain.to_s.end_with? url_e.to_s
+            exclude_list = config.exclude.get(url_e)
+            break
+          end
+        end
+
+        exclude_list = exclude_list.map do | elt |
+          if /^\/.*\/$/ =~ elt
+            Regexp.new(elt[1..-2]) # slash-delimited: already a regexp
+          else
+            Regexp.new("^(.*/)?#{elt}(/.*)?$") # bare word: match a whole path segment
+          end
+        end
+
+        if exclude_list.count == 0
+          return false
+        end
+
+        exclude_list.each do | exclude_rule |
+          if exclude_rule =~ url
+            return true
+          end
+        end
+        return false
+      end
+
+      # Gracefully terminate this selector
+      def graceful_terminate
+        @stopping = true
+        while @status == :running
+          sleep(1)
+        end
+      end
+
+      private
+      # Wait for new urls to be added to the queue, using an exponential backoff
+      def wait_for_url
+        @status = :waiting
+        sleep @wait_time
+        if @wait_time < 30
+          @wait_time = @wait_time * 2 # fixed: original assigned to @wait_times
+        end
+      end
+    end
+  end
+end
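
To make the exclude? parsing concrete, a hedged sketch of an sds_conf.yml passed via --sds-conf. The nesting is inferred from how exclude? walks config.exclude (domain keys mapping to rule lists); the domain and rules are invented:

    :exclude:
      example.com:          # applies to urls whose domain ends with 'example.com'
        - tag               # bare entry: compiled to ^(.*/)?tag(/.*)?$
        - "/\\?page=\\d+/"  # slash-delimited entry: compiled as a raw regexp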
data/lib/news_crawler/nc_logger.rb
ADDED
@@ -0,0 +1,49 @@
+#! /usr/bin/env ruby
+# -*- coding: utf-8 -*-
+
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'logger'
+
+module NewsCrawler
+  class NCLogger
+    # Get logger
+    def self.get_logger
+      @logger ||= Logger.new(STDERR)
+      @logger.progname = 'news_crawler'
+      @logger
+    end
+
+    # Set logger level
+    # @param [ Logger::Severity ] l level
+    def self.set_level(l)
+      get_logger.level = l
+    end
+
+    # Set the log device; should support the same API as Ruby's Logger
+    # @param [ Object ] ld log device
+    def self.set_logdev(ld)
+      @logger = Logger.new(ld)
+      @logger.progname = 'news_crawler'
+    end
+  end
+end
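
A brief usage sketch (the log file path is hypothetical; Ruby's Logger.new accepts a filename or any IO-like device):

    require 'news_crawler/nc_logger'

    NewsCrawler::NCLogger.set_level(Logger::WARN)    # silence info chatter
    NewsCrawler::NCLogger.set_logdev('crawler.log')  # redirect from STDERR to a file
    NewsCrawler::NCLogger.get_logger.warn('fetch error rate is climbing')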
data/lib/news_crawler/storage/raw_data/mongo_storage.rb
ADDED
@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'mongo'
+require 'simple_config'
+require 'news_crawler/storage/raw_data/raw_data_engine'
+
+module NewsCrawler
+  module Storage
+    module RawData
+      # Raw data storage implemented using MongoDB
+      class MongoStorage < NewsCrawler::Storage::RawData::RawDataEngine
+        NAME = 'mongo'
+
+        include Mongo
+
+        def initialize(*opts)
+          config = (SimpleConfig.for :application)
+          client = MongoClient.new(config.mongodb.host, config.mongodb.port)
+          db = client[config.mongodb.db_name]
+          @coll = db[config.prefix + '_' + config.suffix.raw_data]
+          @coll.ensure_index({:url => Mongo::ASCENDING}, {:unique => true})
+        end
+
+        # Add an entry to the raw data collection, overwriting old data
+        # @param [ String ] url
+        # @param [ String ] body
+        def add(url, body)
+          @coll.update({:url => url},
+                       {:$set => {:body => body}},
+                       {:upsert => true})
+        end
+
+        # Find the document for the corresponding url
+        # @param [ String ] url
+        # @return [ String, nil ]
+        def find_by_url(url)
+          result = @coll.find_one({:url => url})
+          if (!result.nil?)
+            result['body']
+          else
+            nil
+          end
+        end
+
+        # Get the number of raw data entries
+        def count
+          @coll.count
+        end
+
+        def clear
+          @coll.remove
+        end
+      end
+    end
+  end
+end
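
Design note: add is an upsert keyed on the unique url index, so re-fetching a page replaces its stored body instead of raising a duplicate-key error. A hedged sketch of exercising the engine directly (assumes a local MongoDB and the application config keys used in initialize; the url and bodies are invented):

    require 'news_crawler/storage/raw_data/mongo_storage'

    store = NewsCrawler::Storage::RawData::MongoStorage.new
    store.add('http://example.com/', '<html>v1</html>')
    store.add('http://example.com/', '<html>v2</html>')  # upsert: overwrites v1
    store.find_by_url('http://example.com/')             # => "<html>v2</html>"
    store.count                                          # => 1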