news_crawler 0.0.0.pre.1

@@ -0,0 +1,67 @@
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ module NewsCrawler
+   module Storage
+     module RawData
+       # Base class for RawData engines.
+       # Subclass and implement all of its methods to create a new RawData
+       # engine; keep the method signatures unchanged.
+       class RawDataEngine
+         def self.inherited(klass)
+           @engine_list = (@engine_list || []) + [klass]
+         end
+
+         # Get the list of registered engines
+         # @return [ Hash ] map of engine names to engine classes
+         def self.get_engines
+           @engine_list = @engine_list || []
+           @engine_list.inject({}) do | memo, klass |
+             memo[klass::NAME.intern] = klass
+             memo
+           end
+         end
+
+         # Add an entry to the raw data collection
+         # @param [ String ] url
+         # @param [ String ] body
+         def add(url, body)
+           raise NotImplementedError
+         end
+
+         # Find the document with the corresponding URL
+         # @param [ String ] url
+         # @return [ String, nil ]
+         def find_by_url(url)
+           raise NotImplementedError
+         end
+
+         def count
+           raise NotImplementedError
+         end
+
+         def clear
+           raise NotImplementedError
+         end
+       end
+     end
+   end
+ end
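
The `inherited` hook gives `RawDataEngine` a simple plugin registry: every subclass is recorded at class-definition time, and `get_engines` exposes the registry keyed by each subclass's `NAME` constant. As a rough sketch of how a custom engine plugs in (this in-memory `MemoryEngine` is hypothetical, not shipped with the gem):

    require 'news_crawler/storage/raw_data/raw_data_engine'

    module NewsCrawler
      module Storage
        module RawData
          # Hypothetical engine that keeps raw pages in a Hash in memory
          class MemoryEngine < RawDataEngine
            NAME = 'memory' # get_engines registers the class under :memory

            def initialize(*opts)
              @docs = {}
            end

            def add(url, body)
              @docs[url] = body
            end

            def find_by_url(url)
              @docs[url]
            end

            def count
              @docs.size
            end

            def clear
              @docs.clear
            end
          end
        end
      end
    end

    # NewsCrawler::Storage::RawData::RawDataEngine.get_engines
    # now returns a hash including { :memory => MemoryEngine }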
@@ -0,0 +1,74 @@
+ # -*- coding: utf-8 -*-
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'simple_config'
+
+ require 'news_crawler/storage/raw_data/mongo_storage'
+ require 'news_crawler/storage/raw_data/raw_data_engine'
+
+ module NewsCrawler
+   module Storage
+     # Store raw data from websites
+     module RawData
+       class << self
+         # Set the RawData storage engine
+         # @param [ Symbol, Object ] engine database engine; pass an object
+         #   to use a custom engine. As a symbol this can be:
+         #   * `:mongo`, `:mongodb` for the MongoDB backend
+         # @param [ Hash ] opts options passed to the engine
+         def set_engine(engine, *opts)
+           if engine.respond_to? :intern
+             engine = engine.intern
+           end
+           engine_class = RawDataEngine.get_engines[engine]
+           if engine_class
+             @engine = engine_class.new(*opts)
+           else
+             @engine = engine
+           end
+         end
+
+         # Add an entry to the raw data collection
+         # @param [ String ] url
+         # @param [ String ] body
+         def add(url, body)
+           @engine.add(url, body)
+         end
+
+         # Find the document with the corresponding URL
+         # @param [ String ] url
+         # @return [ String, nil ]
+         def find_by_url(url)
+           @engine.find_by_url url
+         end
+
+         def count
+           @engine.count
+         end
+
+         def clear
+           @engine.clear
+         end
+       end
+     end
+   end
+ end
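
`set_engine` resolves a symbol through `RawDataEngine.get_engines` and instantiates the matching class, so typical use is one setup call followed by the facade methods, which all delegate to `@engine`. A minimal sketch, assuming the MongoDB settings the bundled engine reads from SimpleConfig are in place (URL and body are placeholders):

    require 'news_crawler/storage/raw_data'

    include NewsCrawler::Storage

    RawData.set_engine(:mongo)  # looked up via RawDataEngine.get_engines
    RawData.add('http://example.com/', '<html>...</html>')
    RawData.find_by_url('http://example.com/')  # => '<html>...</html>'
    RawData.count                               # => 1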
@@ -0,0 +1,218 @@
+ # -*- coding: utf-8 -*-
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'mongo'
+ require 'simple_config'
+ require 'news_crawler/storage/url_queue/url_queue_error'
+ require 'news_crawler/storage/url_queue/url_queue_engine'
+
+ module NewsCrawler
+   module Storage
+     module URLQueue
+       # URL queue storage engine with MongoDB backend
+       class MongoEngine < NewsCrawler::Storage::URLQueue::URLQueueEngine
+         NAME = 'mongo'
+
+         include Mongo
+
+         # Construct a queue
+         def initialize(*opts)
+           config = SimpleConfig.for :application
+           db = MongoClient.new(config.mongodb.host, config.mongodb.port,
+                                pool_size: 4,
+                                pool_timeout: 5)[config.mongodb.db_name]
+           coll_name = config.prefix + '_' + config.suffix.url_queue
+           h_opts = ((opts[-1].is_a? Hash) ? opts[-1] : {})
+           @coll = db[h_opts[:coll_name] || coll_name]
+           @coll.ensure_index({:url => Mongo::ASCENDING}, {:unique => true})
+         end
+
+         # Add a URL to the queue, with an optional referring URL
+         # @param [ String ] url
+         # @param [ String ] ref_url
+         def add(url, ref_url = '')
+           if (ref_url == '')
+             depth = 0
+           else
+             depth = (get_url_depth(ref_url) || 0) + 1
+           end
+           begin
+             @coll.insert({:url => url,
+                           :depth => depth,
+                           :visited => false})
+           rescue Mongo::OperationFailure => e
+             if e.error_code == 11000 # duplicate key error
+               raise DuplicateURLError, url
+             else
+               raise e
+             end
+           end
+         end
+
+         # Mark a URL as visited
+         # @param [ String ] url
+         def mark_visited(url)
+           @coll.update({:url => url},
+                        {:$set => {'visited' => true}})
+         end
+
+         # Mark all URLs as unvisited
+         def mark_all_unvisited
+           @coll.update({},
+                        {:$set => {'visited' => false}},
+                        {:multi => true})
+         end
+
+         # # Mark a URL as processed
+         # # @param [ String ] url
+         # def mark_processed(url, **opts)
+         #   @coll.update({:url => url},
+         #                {:$set => {:processed => true}})
+         # end
+
+         # Set the processing state of a URL for the given module
+         # @param [ String ] module_name
+         # @param [ String ] url
+         # @param [ String ] state one of unprocessed, processing, processed
+         def mark(module_name, url, state)
+           @coll.update({:url => url},
+                        {:$set => {module_name => state}})
+         end
+
+         # Change all URLs in one state to another state
+         # @param [ String ] module_name
+         # @param [ String ] new_state new state
+         # @param [ String ] orig_state original state
+         def mark_all(module_name, new_state, orig_state = nil)
+           selector = (orig_state.nil? ? {} : {module_name => orig_state})
+           @coll.update(selector,
+                        {:$set => {module_name => new_state}},
+                        :multi => true)
+         end
+
+         # Get all URLs and their status
+         # @return [ Array ] array of hashes containing url and status
+         def all(*opts)
+           @coll.find.collect do | entry |
+             entry.each_key.inject({}) do | memo, key |
+               if key != '_id'
+                 memo[key.intern] = entry[key]
+               end
+               memo
+             end
+           end
+         end
+
+         # TODO fix bug - find *visited* url
+         # Find all visited URLs with the given module process state
+         # @param [ String ] module_name
+         # @param [ String ] state one of unprocessed, processing, processed
+         # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+         # @return [ Array ] URL list
+         def find_all(module_name, state, max_depth = -1)
+           if (state == URLQueue::UNPROCESSED)
+             selector = {:$or => [{module_name => state},
+                                  {module_name => {:$exists => false}}]}
+           else
+             selector = {module_name => state}
+           end
+           selector = {:$and => [selector,
+                                 {'visited' => true}]}
+           if max_depth > -1
+             selector[:$and] << {'depth' => {:$lte => max_depth}}
+           end
+           @coll.find(selector).collect do | entry |
+             entry['url']
+           end
+         end
+
+         # Find one visited URL with the given module process state
+         # @param [ String ] module_name
+         # @param [ String ] state one of unprocessed, processing, processed
+         # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+         # @return [ String, nil ] URL, or nil if no URL matches the criteria
+         def find_one(module_name, state, max_depth = -1)
+           a = find_all(module_name, state, max_depth)
+           if a.size > 0
+             a[0]
+           else
+             nil
+           end
+         end
+
+         # Atomically get the next unprocessed URL and mark it as processing
+         # @param [ String ] module_name
+         # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+         # @return [ String, nil ] URL, or nil if no such URL exists
+         def next_unprocessed(module_name, max_depth = -1)
+           selector = {:$or => [{module_name => URLQueue::UNPROCESSED},
+                                {module_name => {:$exists => false}}]}
+           selector = {:$and => [selector,
+                                 {'visited' => true}]}
+           if max_depth > -1
+             selector[:$and] << {'depth' => {:$lte => max_depth}}
+           end
+           doc = @coll.find_and_modify(:query => selector,
+                                       :update => {:$set =>
+                                         {module_name => URLQueue::PROCESSING}})
+           (doc.nil? ? nil : doc['url'])
+         end
+         alias :find_and_mark :next_unprocessed
+
+         # Get the list of unvisited URLs
+         # @param [ Fixnum ] max_depth maximum depth of URLs to return
+         # @return [ Array ] unvisited URLs, optionally limited by maximum depth
+         def find_unvisited(max_depth = -1)
+           if max_depth > -1
+             selector = {:$and => [{'visited' => false},
+                                   {'depth' => {:$lte => max_depth}}]}
+           else
+             selector = {'visited' => false}
+           end
+           @coll.find(selector).collect do | entry |
+             entry['url']
+           end
+         end
+
+         # Clear the URL queue
+         # @return [ Fixnum ] number of URLs removed
+         def clear(*opts)
+           count = @coll.count
+           @coll.remove
+           count
+         end
+
+         # Get the depth of the given URL
+         # @param [ String ] url
+         # @return [ Fixnum, nil ] URL depth, or nil if the URL is not queued
+         def get_url_depth(url)
+           doc = @coll.find_one({'url' => url}, {:fields => ['depth']})
+           doc && doc['depth']
+         end
+       end
+     end
+   end
+ end
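
Two details of MongoEngine are worth calling out: the unique index on `:url` is what surfaces re-queued URLs as `DuplicateURLError`, and `find_and_modify` flips the module state to `processing` in the same server round trip, so two workers cannot claim the same URL. A hedged usage sketch, assuming a SimpleConfig `:application` block providing `mongodb.host`, `mongodb.port`, `mongodb.db_name`, `prefix`, and `suffix.url_queue` as `initialize` expects (collection name and URLs are placeholders):

    require 'news_crawler/storage/url_queue/mongo_storage'

    queue = NewsCrawler::Storage::URLQueue::MongoEngine.new(coll_name: 'test_url_queue')

    queue.add('http://example.com/')                          # depth 0
    queue.add('http://example.com/a', 'http://example.com/')  # depth = referrer depth + 1
    queue.mark_visited('http://example.com/')

    # Only visited URLs are candidates; this atomically marks the result
    # as 'processing' for the 'downloader' module.
    queue.next_unprocessed('downloader')  # => 'http://example.com/'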
@@ -0,0 +1,124 @@
+ # -*- coding: utf-8 -*-
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ module NewsCrawler
+   module Storage
+     module URLQueue
+       # Base class for URLQueue engines.
+       # Subclass and implement all of its methods to create a new URLQueue
+       # engine; keep the method signatures unchanged.
+       class URLQueueEngine
+         def self.inherited(klass)
+           @engine_list = (@engine_list || []) + [klass]
+         end
+
+         # Get the list of registered engines
+         # @return [ Hash ] map of engine names to engine classes
+         def self.get_engines
+           @engine_list = @engine_list || []
+           @engine_list.inject({}) do | memo, klass |
+             memo[klass::NAME.intern] = klass
+             memo
+           end
+         end
+
+         # Set the processing state of a URL for the given module
+         # @param [ String ] module_name
+         # @param [ String ] url
+         # @param [ String ] state one of unprocessed, processing, processed
+         def mark(module_name, url, state)
+           raise NotImplementedError
+         end
+
+         # Change all URLs in one state to another state
+         # @param [ String ] module_name
+         # @param [ String ] new_state new state
+         # @param [ String ] orig_state original state
+         def mark_all(module_name, new_state, orig_state = nil)
+           raise NotImplementedError
+         end
+
+         # Produce the next unprocessed URL and mark it as processing
+         # @param [ String ] module_name
+         # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+         # @return [ String, nil ]
+         def next_unprocessed(module_name, max_depth = -1)
+           raise NotImplementedError
+         end
+
+         # Find all visited URLs with the given module process state
+         # @param [ String ] module_name
+         # @param [ String ] state
+         # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+         # @return [ Array ] URL list
+         def find_all(module_name, state, max_depth = -1)
+           raise NotImplementedError
+         end
+
+         # Find one visited URL with the given module process state
+         # @param [ String ] module_name
+         # @param [ String ] state one of unprocessed, processing, processed
+         # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+         # @return [ String, nil ] URL
+         def find_one(module_name, state, max_depth = -1)
+           raise NotImplementedError
+         end
+
+         # Get the list of unvisited URLs
+         # @param [ Fixnum ] max_depth maximum depth of URLs to return
+         # @return [ Array ] unvisited URLs, optionally limited by maximum depth
+         def find_unvisited(max_depth = -1)
+           raise NotImplementedError
+         end
+
+         # Add a URL with a referring URL
+         # @param [ String ] url URL
+         # @param [ String ] ref_url referring URL
+         def add(url, ref_url = '')
+           raise NotImplementedError
+         end
+
+         # Clear the URL queue
+         # @return [ Fixnum ] number of URLs removed
+         def clear
+           raise NotImplementedError
+         end
+
+         # Mark a URL as visited
+         # @param [ String ] url
+         def mark_visited(url)
+           raise NotImplementedError
+         end
+
+         # Mark all URLs as unvisited
+         def mark_all_unvisited
+           raise NotImplementedError
+         end
+
+         # Get all URLs with status
+         # @return [ Array ] URL list
+         def all
+           raise NotImplementedError
+         end
+       end
+     end
+   end
+ end
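
The abstract engine encodes a small per-module state machine: each crawler module tracks its own state field per URL, so one URL can be processed for the downloader while still unprocessed for an analyser. A sketch of the intended lifecycle against any conforming engine (`engine` is assumed to be an instance of a concrete subclass, the module name is illustrative):

    # unprocessed -> processing happens atomically inside next_unprocessed;
    # processing -> processed is the caller's responsibility via mark.
    while (url = engine.next_unprocessed('analyser'))
      # ... do this module's work on url ...
      engine.mark('analyser', url, NewsCrawler::Storage::URLQueue::PROCESSED)
    end
    # URLs left 'processing' by a crashed run can be reset with
    # engine.mark_all('analyser', 'unprocessed', 'processing')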
@@ -0,0 +1,28 @@
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ module NewsCrawler
+   module Storage
+     module URLQueue
+       class DuplicateURLError < StandardError; end
+     end
+   end
+ end
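
`DuplicateURLError` carries the offending URL as its message (see the `raise DuplicateURLError, url` in MongoEngine above). Since crawlers re-discover the same links on many pages, callers typically treat it as a non-event; a brief sketch (`queue`, `link`, and `page_url` as in the MongoEngine sketch above):

    begin
      queue.add(link, page_url)
    rescue NewsCrawler::Storage::URLQueue::DuplicateURLError
      # link was queued earlier; duplicates are expected while crawling
    end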
@@ -0,0 +1,150 @@
+ # -*- coding: utf-8 -*-
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'simple_config'
+ require 'news_crawler/storage/url_queue/mongo_storage'
+ require 'news_crawler/storage/url_queue/url_queue_engine'
+
+ module NewsCrawler
+   module Storage
+     # Store and manipulate the URL queue
+     module URLQueue
+       ACTION_LIST = [:mark_visited, :mark_processed, :find_unvisited,
+                      :find_unprocessed, :find_unprocessed_with_depth]
+       PROCESSED = 'processed'
+       PROCESSING = 'processing'
+       UNPROCESSED = 'unprocessed'
+
+       class << self
+         # Set the URLQueue storage engine
+         # @param [ Symbol, Object ] engine database engine; pass an object
+         #   to use a custom engine. As a symbol this can be:
+         #   * `:mongo`, `:mongodb` for the MongoDB backend
+         # @param [ Hash ] opts options passed to the engine
+         def set_engine(engine, *opts)
+           if engine.respond_to? :intern
+             engine = engine.intern
+           end
+           engine_class = URLQueueEngine.get_engines[engine]
+           if engine_class
+             @engine = engine_class.new(*opts)
+           else
+             @engine = engine
+           end
+         end
+
+         # Mark a URL as visited
+         # @param [ String ] url
+         def mark_visited(url)
+           url = normalize_url url
+           @engine.mark_visited(url)
+         end
+
+         # Mark all URLs as unvisited
+         def mark_all_unvisited
+           @engine.mark_all_unvisited
+         end
+
+         # Set the processing state of a URL for the given module
+         # @param [ String ] module_name
+         # @param [ String ] url
+         # @param [ String ] state one of unprocessed, processing, processed
+         def mark(module_name, url, state)
+           url = normalize_url url
+           @engine.mark(module_name, url, state)
+         end
+
+         # Change all URLs in one state to another state
+         # @param [ String ] module_name
+         # @param [ String ] new_state new state
+         # @param [ String ] orig_state original state
+         def mark_all(module_name, new_state, orig_state = nil)
+           @engine.mark_all(module_name, new_state, orig_state)
+         end
+
+         # Find all visited URLs with the given module process state
+         # @param [ String ] module_name
+         # @param [ String ] state
+         # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+         # @return [ Array ] URL list
+         def find_all(module_name, state, max_depth = -1)
+           @engine.find_all(module_name, state, max_depth)
+         end
+
+         # Find one visited URL with the given module process state
+         # @param [ String ] module_name
+         # @param [ String ] state one of unprocessed, processing, processed
+         # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+         # @return [ String, nil ] URL
+         def find_one(module_name, state, max_depth = -1)
+           @engine.find_one(module_name, state, max_depth)
+         end
+
+         # Atomically get the next unprocessed URL and mark it as processing
+         # @param [ String ] module_name
+         # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+         # @return [ String, nil ] URL, or nil if no such URL exists
+         def next_unprocessed(module_name, max_depth = -1)
+           @engine.next_unprocessed(module_name, max_depth)
+         end
+
+         # Get the list of unvisited URLs
+         # @param [ Fixnum ] max_depth maximum depth of URLs to return
+         # @return [ Array ] unvisited URLs, optionally limited by maximum depth
+         def find_unvisited(max_depth = -1)
+           @engine.find_unvisited(max_depth)
+         end
+
+         # Add a URL to the queue
+         # @param [ String ] url
+         # @param [ String ] ref_url referring URL
+         def add(url, ref_url = '')
+           url = normalize_url url
+           if ref_url != ''
+             ref_url = normalize_url ref_url
+           end
+           @engine.add(url, ref_url)
+         end
+
+         # Clear the URL queue
+         # @return [ Fixnum ] number of URLs removed
+         def clear
+           @engine.clear
+         end
+
+         # Get all URLs with status
+         # @return [ Array ] URL list
+         def all
+           @engine.all
+         end
+
+         # Prefix a URL with http:// if no scheme is present
+         def normalize_url(url)
+           if (!url.start_with? "http")
+             "http://" + url
+           else
+             url
+           end
+         end
+       end
+     end
+   end
+ end
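
Putting the facade together: engine selection, seeding, and a processing loop. A minimal sketch with an illustrative module name, assuming the same SimpleConfig MongoDB settings as the engine above; note that `normalize_url` lets callers pass scheme-less URLs:

    require 'news_crawler/storage/url_queue'

    include NewsCrawler::Storage

    URLQueue.set_engine(:mongo)
    URLQueue.add('example.com')           # stored as http://example.com
    URLQueue.mark_visited('example.com')  # a downloader would do this after fetching

    # Process visited URLs up to depth 2 for the hypothetical 'analyser' module.
    while (url = URLQueue.next_unprocessed('analyser', 2))
      # ... extract links, store raw data, etc. ...
      URLQueue.mark('analyser', url, URLQueue::PROCESSED)
    end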