news_crawler 0.0.3 → 0.0.4.pre.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 50f3793b5c9e31eeaba0d7650ddf20d451de16c2
-  data.tar.gz: 5796887e83aa9912b02fa67f69bb1b1f5f34ecec
+  metadata.gz: 37f387711eea761bdcaa5cf16f30eb295b60652e
+  data.tar.gz: 127ab77d72f429a3bacf46a24367331eaadf471f
 SHA512:
-  metadata.gz: 7d97e8b6630aebad685d9c35b848f575eb293e691a3e42e1cb4c86bee1920df27893fdb51d075dad92dbd029a5eae25e26ee1a5ab8ab6f51a31078fbef479caf
-  data.tar.gz: 08486762b1261bd0a5b950e63084a216bfd2a710b15c54bceec1d604dcfe77088da2b9265f9a99093281009427d2eb7dd5766592513b54255f75dcf28c16bc97
+  metadata.gz: 278f80bc1f4eec78a2536ee74edebfe94b26b45a15c9577bedb7214325ad4cdeda6e64eaae002e7f5c48480fe4eaefb413e4e00bef518915de1fb8cfbb08321d
+  data.tar.gz: c57baf50222284a38769636c2cb4cc57e075bf8c008f32765c9dc92a2992afe6eb8d5eb32b6d5e04bf5695167c55be40c74eed4b981462d2a9a27959cc852c3b
lib/news_crawler/crawler_module.rb CHANGED
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 #--
 # NewsCrawler - a website crawler
 #
@@ -20,7 +21,8 @@
 #++
 
 require 'news_crawler/storage/url_queue'
-require 'thread'
+require 'news_crawler/storage/yaml_stor'
+
 
 module NewsCrawler
   # Include this to get basic module methods
@@ -66,5 +68,23 @@ module NewsCrawler
     def next_unprocessed(max_depth = -1)
       URLQueue.next_unprocessed(self.class.name, max_depth)
     end
+
+    def mark_all_as_unprocessed
+      URLQueue.mark_all(self.class.name, URLQueue::UNPROCESSED)
+    end
+
+    # Serialize object to YAML and save it (overwrite if key exists)
+    # @param [ String ] key
+    # @param [ Object ] value
+    def save_yaml(key, value)
+      YAMLStor.add(self.class.name, key, value)
+    end
+
+    # Load YAML object
+    # @param [ String ] key
+    # @return [ Object, nil ]
+    def load_yaml(key)
+      YAMLStor.get(self.class.name, key)
+    end
   end
 end
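The two YAML helpers above give every crawler module a simple per-module key/value store. Below is a minimal usage sketch; the WordSeen class and its logic are hypothetical, while next_unprocessed, mark_processed, save_yaml and load_yaml are CrawlerModule methods shown or referenced in this diff, and the sketch assumes the YAMLStor engine has already been configured (see yaml_stor.rb below).

    require 'news_crawler/crawler_module'

    module NewsCrawler
      # Hypothetical module illustrating the new save_yaml / load_yaml helpers
      class WordSeen
        include CrawlerModule

        def run
          while (url = next_unprocessed)
            save_yaml(url, { :seen_at => Time.now })  # keyed by URL, scoped to this class
            mark_processed(url)
          end
        end

        def seen_at(url)
          load_yaml(url)  # returns nil if nothing was saved for this key
        end
      end
    end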
lib/news_crawler/default_config.yml CHANGED
@@ -1,4 +1,4 @@
-db:
+:db:
   :engine: :mongo
 
   :mongodb:
@@ -9,5 +9,6 @@ db:
   :suffix:
     :raw_data: raw_data
     :url_queue: url_queue
+    :yaml: yaml
 
-  prefix: ''
+  :prefix: ''
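For reference, the new :yaml suffix feeds the collection naming used by the MongoDB-backed YAML store added later in this diff: the collection name is the configured prefix joined to the suffix with an underscore. A small sketch (the 'nc' prefix is hypothetical; the gem's default prefix is the empty string):

    require 'simple_config'

    config = SimpleConfig.for :application
    config.prefix + '_' + config.suffix.yaml  # => "nc_yaml" with prefix 'nc', "_yaml" with the default ''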
lib/news_crawler/processing/structure_analysis.rb ADDED
@@ -0,0 +1,299 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'nokogiri'
+require 'uri'
+require 'set'
+require 'digest'
+
+require 'news_crawler/url_helper'
+require 'news_crawler/storage/url_queue'
+require 'news_crawler/storage/raw_data'
+require 'news_crawler/crawler_module'
+require 'news_crawler/nc_logger'
+
+module NewsCrawler
+  module Processing
+    # Analyse website structure to extract content.
+    # The database should only contain raw data from one website.
+    class StructureAnalysis
+      include CrawlerModule
+      include URLHelper
+
+      def initialize
+        @url_stats = {}
+        while (url = next_unprocessed)
+          NCLogger.get_logger.info "[NC::P::SA] Processing #{url}"
+          re = extract_content(url)
+          @url_stats[url] = re
+          save_yaml(url, re)
+        end
+      end
+
+      def extract_content(url)
+        html_doc = RawData.find_by_url(url)
+        result = {}
+        result[:type] = :article
+
+        # Remove tags that cause trouble for Nokogiri
+        html_doc = remove_tag(html_doc, 'script')
+        html_doc = remove_tag(html_doc, 'iframe')
+        html_doc = remove_tag(html_doc, 'style')
+
+        doc = Nokogiri::HTML.parse(html_doc)
+        longest = find_longest_node(doc)
+        lowest_ancestor, path_to_longest = find_lowest_ancestor_has_id(longest)
+
+        # Heuristic 1
+        # Longest content sits directly under an element with an id attribute
+        if path_to_longest.length == 2
+          return { :type => :list }
+        end
+
+        parent = path_to_longest[1..-1]
+        parent = parent.reverse
+        xpath_path = parent.join('/')
+        xpath_path = '//' + xpath_path + '//text()'
+
+        guest_type = classify_h2(longest, lowest_ancestor)
+        result = { :type => guest_type }
+
+        if (result[:type] == :article)
+          title_ = lowest_ancestor.css('h1')
+          if title_.count == 1
+            result[:title] = title_.to_a[0].content
+          else
+            # if we can't guess the title, assume it isn't an article
+            result[:type] = :list
+          end
+
+          main_content = ''
+          lowest_ancestor.xpath(xpath_path).each do | node |
+            main_content += node.content
+          end
+
+          result[:content] = main_content
+        end
+
+        mark_processed(url)
+        result
+      end
+
+      # Predict whether the tree rooted at this node is a fragment of an article or of an index page
+      # @param [ Nokogiri::XML::Node ] root
+      # @param [ Nokogiri::XML::Node ] limit limit node to search backward
+      # @return [ Symbol ] one of :article, :list
+      def classify_h2(root, limit)
+        current = root
+        current = current.parent if current.text?
+
+        depth = 0
+
+        while true
+          expect_hash = hash_node(current, 0)
+          previous = current
+          current = current.parent
+
+          depth += 1
+          lons = {}
+          node_count = 0
+          node_list = [previous]
+          current.children.each do | child |
+            hc = hash_node(child, depth - 1)
+            if hc == expect_hash
+              node_count += 1
+              node_list << child
+            end
+          end
+
+          if node_count > 1
+            a_tag_len, non_a_tag_len = count_a_and_non_a_tag(current)
+            if non_a_tag_len > a_tag_len
+              return :article
+            else
+              return :list
+            end
+            break
+          end
+
+          if current == limit
+            a_tag_len, non_a_tag_len = count_a_and_non_a_tag(current)
+            if non_a_tag_len > a_tag_len
+              return :article
+            else
+              return :list
+            end
+            break
+          end
+        end
+
+        return :list
+      end
+
+      # Count a tags and non-a text nodes in the tree pointed to by node
+      # @param [ Nokogiri::XML::Node ] node
+      # @return [ [Fixnum, Fixnum] ] a tag and non-a text node counts
+      def count_a_and_non_a_tag(node)
+        a_tag_list = node.xpath './/a'
+        a_tag_len = a_tag_list.count # number of a tags
+
+        non_a_tag_list = node.xpath './/text()[not (ancestor::a)]'
+        non_a_tag_len = non_a_tag_list.to_a.inject(0) do | memo, node |
+          if node.content.gsub(/\s+/, '').length > 15
+            memo + 1
+          else
+            memo
+          end
+        end
+        [ a_tag_len, non_a_tag_len ]
+      end
+
+      # Find the lowest ancestor of node that has an id attribute
+      # @param [ Nokogiri::XML::Node ] node
+      # @return [ Nokogiri::XML::Node ]
+      def find_lowest_ancestor_has_id(node)
+        found_id = false
+
+        closest_ancestor = node
+
+        path_to_closest = []
+
+        while (!found_id)
+          if closest_ancestor.has_attribute?('id')
+            path_to_closest << "#{closest_ancestor.node_name}[@id='#{closest_ancestor.attribute('id')}']"
+            found_id = true
+          else
+            if closest_ancestor.has_attribute?('class')
+              node_class = "@class = '#{closest_ancestor.attribute('class')}'"
+            else
+              node_class = 'not(@class)'
+            end
+            path_to_closest << "#{closest_ancestor.node_name}[#{node_class}]"
+            closest_ancestor = closest_ancestor.parent
+          end
+        end
+
+        return [ closest_ancestor, path_to_closest ]
+      end
+
+      # Find the longest text node that doesn't have an a element in its ancestor list
+      # @param [ Nokogiri::XML::Node ] doc
+      def find_longest_node(doc)
+        xpath_query = '//*[@id]//text()[not (ancestor::a)]'
+
+        a_l = doc.xpath xpath_query
+
+        longest = nil
+        longest_len = 0
+
+        a_l.each do | en |
+          node_content_wo_space = en.content.gsub(/\s/, '') # measure length ignoring whitespace
+          if node_content_wo_space.length > longest_len
+            longest_len = node_content_wo_space.length
+            longest = en
+          end
+        end
+
+        return longest
+      end
+
+      # Remove unwanted HTML tags
+      # @param [ String ] html_doc HTML document
+      # @param [ String ] tag tag to be removed
+      def remove_tag(html_doc, tag)
+        pattern = Regexp.new("<#{tag}.*?>.*?</#{tag}>", Regexp::MULTILINE)
+        html_doc.gsub(pattern, '')
+      end
+
+      # Return a String representing the node's name, id and class
+      # @param [ Nokogiri::XML::Node ] node
+      # @return [ String ]
+      def node_info(node)
+        node_pp = node.node_name
+        node_pp += '#' + node.attribute('id') if node.has_attribute?('id')
+        node_pp += '.' + node.attribute('class') if node.has_attribute?('class')
+        node_pp
+      end
+
+      # Calculate a hash of a node from its own and its children's info
+      # @param [ Nokogiri::XML::Node ] node
+      # @param [ Fixnum ] limit limit depth of children (-1 for unlimited)
+      # @return [ String ] hash of node in base64 encoding
+      def hash_node(node, limit = -1)
+        node_sign = node.node_name
+        node_sign += "##{node['id']}" unless node['id'].nil?
+        node_sign += ".#{node['class']}" unless node['class'].nil?
+
+        hash_sum = node_sign
+
+        if limit != 0
+          child_hash = Set.new
+          node.children.each do | child_node |
+            child_hash.add(hash_node(child_node, limit - 1))
+          end
+
+          child_hash.each do | ch |
+            hash_sum += ch
+          end
+        end
+
+        Digest::SHA2.new.base64digest(hash_sum)
+      end
+
+
+      # Get and analyse url for information
+      def analyse(url)
+        # puts "processing #{url}"
+        html_doc = RawData.find_by_url(url)
+        doc = Nokogiri.HTML(html_doc)
+        inner_url = doc.xpath('//a').collect { | a_el |
+          temp_url = (a_el.attribute 'href').to_s
+          if (!temp_url.nil?) && (temp_url[0] == '/')
+            temp_url = URI.join(url, temp_url).to_s
+          end
+          temp_url
+        }
+
+        inner_url.delete_if { | url_0 |
+          (url_0.nil?) || (url_0.size == 0) || (url_0 == '#') ||
+          (url_0 == 'javascript:;')
+        }
+
+        inner_url.each do | url |
+          @url_stats[url] = (@url_stats[url] || 0) + 1
+        end
+        mark_processed(url)
+      end
+
+      # Check whether the string is really a 'url'
+      # @param [ String ] url
+      # @return [ Boolean ]
+      def is_url?(url)
+        (url.size != 0) && (url != '#') && (url != 'javascript:;')
+      end
+
+      def get_result
+        @url_stats
+      end
+    end
+  end
+end
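StructureAnalysis does all of its work in the constructor: it pulls every unprocessed URL from the queue, classifies the page as :article or :list, and stores the result both in @url_stats and through save_yaml. A usage sketch, assuming the URL queue and raw data storage are already populated and the storage engines are configured:

    require 'news_crawler/processing/structure_analysis'

    analysis = NewsCrawler::Processing::StructureAnalysis.new  # processes the whole queue
    analysis.get_result.each do |url, info|
      # info is the hash built by extract_content: :type, plus :title/:content for articles
      puts "#{url} -> #{info[:type]}"
    end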
lib/news_crawler/storage/yaml_stor/mongo_storage.rb ADDED
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'mongo'
+require 'yaml'
+require 'simple_config'
+require 'news_crawler/storage/yaml_stor/yaml_stor_engine'
+require 'news_crawler/nc_logger'
+
+
+module NewsCrawler
+  module Storage
+    module YAMLStor
+      # YAML storage implemented using MongoDB
+      class MongoStorage < NewsCrawler::Storage::YAMLStor::YAMLStorEngine
+        NAME = 'mongo'
+
+        include Mongo
+
+        def initialize(*opts)
+          config = (SimpleConfig.for :application)
+          client = MongoClient.new(config.mongodb.host, config.mongodb.port)
+          db = client[config.mongodb.db_name]
+          @coll = db[config.prefix + '_' + config.suffix.yaml]
+          # @coll.ensure_index({:key => Mongo::ASCENDING}, {:unique => true})
+        end
+
+        # Add an entry to the yaml collection, overwriting old data
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @param [ Object ] value object to be serialized to YAML
+        def add(module_name, key, value)
+          yaml_str = value.to_yaml
+          yaml_str.encode!('utf-8', :invalid => :replace, :undef => :replace)
+          @coll.update({:key => key,
+                        :m_name => module_name},
+                       {:$set => {:value => yaml_str}},
+                       {:upsert => true})
+        end
+
+        # Find the document with the corresponding key
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @return [ Object, nil ]
+        def get(module_name, key)
+          result = @coll.find_one({:key => key,
+                                   :m_name => module_name})
+          if (!result.nil?)
+            YAML.load(result['value'])
+          else
+            nil
+          end
+        end
+
+        # Get the number of stored YAML entries
+        def count
+          @coll.count
+        end
+
+        def clear
+          @coll.remove
+        end
+      end
+    end
+  end
+end
lib/news_crawler/storage/yaml_stor/yaml_stor_engine.rb ADDED
@@ -0,0 +1,70 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+module NewsCrawler
+  module Storage
+    module YAMLStor
+      # Base class for YAMLStor engines.
+      # Subclass and implement all of its methods to create a new YAMLStor engine;
+      # you should keep the methods' signatures unchanged
+      class YAMLStorEngine
+        def self.inherited(klass)
+          @engine_list = (@engine_list || []) + [klass]
+        end
+
+        # Get engine list
+        # @return [ Hash ] map of engine names to YAML storage engine classes
+        def self.get_engines
+          @engine_list = @engine_list || []
+          @engine_list.inject({}) do | memo, klass |
+            memo[klass::NAME.intern] = klass
+            memo
+          end
+        end
+
+        # Add an entry to the YAML storage
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @param [ Object ] value
+        def add(module_name, key, value)
+          raise NotImplementedError
+        end
+
+        # Get an entry from the YAML storage
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @return [ Object, nil ] value, or nil if key isn't found
+        def get(module_name, key)
+          raise NotImplementedError
+        end
+
+        def count
+          raise NotImplementedError
+        end
+
+        def clear
+          raise NotImplementedError
+        end
+      end
+    end
+  end
+end
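Because YAMLStorEngine.inherited registers every subclass, adding a new backend only requires a NAME constant and the four methods above. A hypothetical in-memory engine (for tests; not part of the gem) could look like the sketch below, and would then be selectable with set_engine(:memory):

    require 'yaml'
    require 'news_crawler/storage/yaml_stor/yaml_stor_engine'

    module NewsCrawler
      module Storage
        module YAMLStor
          # Hypothetical in-memory engine, registered automatically via YAMLStorEngine.inherited
          class MemoryStorage < YAMLStorEngine
            NAME = 'memory'

            def initialize(*opts)
              @data = {}
            end

            def add(module_name, key, value)
              @data[[module_name, key]] = value.to_yaml
            end

            def get(module_name, key)
              yaml = @data[[module_name, key]]
              yaml && YAML.load(yaml)
            end

            def count
              @data.size
            end

            def clear
              @data.clear
            end
          end
        end
      end
    end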
lib/news_crawler/storage/yaml_stor.rb ADDED
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'simpleconfig'
+
+#!!!
+require 'news_crawler/storage/yaml_stor/yaml_stor_engine'
+require 'news_crawler/storage/yaml_stor/mongo_storage'
+
+module NewsCrawler
+  module Storage
+    # YAML data storage.
+    # You can use it to store processed data or configuration
+    module YAMLStor
+      class << self
+        # Set YAMLStor storage engine
+        # @param [ Symbol, Object ] engine specify database engine, pass an object for a custom engine.
+        #   This can be
+        #   * `:mongo` for the MongoDB backend
+        # @param [ Hash ] opts options passed to the engine
+        def set_engine(engine, *opts)
+          if engine.respond_to? :intern
+            engine = engine.intern
+          end
+          engine_class = YAMLStorEngine.get_engines[engine]
+          if engine_class
+            @engine = engine_class.new(*opts)
+          else
+            @engine = engine
+          end
+        end
+
+        # Add an entry to YAML storage
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @param [ Object ] value object to serialize
+        def add(module_name, key, value)
+          @engine.add(module_name, key, value)
+        end
+
+        # Find the document with the corresponding key
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @return [ Object, nil ]
+        def get(module_name, key)
+          @engine.get(module_name, key)
+        end
+
+        def count
+          @engine.count
+        end
+
+        def clear
+          @engine.clear
+        end
+      end
+    end
+  end
+end
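Putting the pieces together, the facade selects a registered engine by name and then delegates add/get to it. A sketch of typical use, assuming a SimpleConfig :application config with the mongodb, prefix and suffix settings from default_config.yml; 'MyModule' and 'stats' are arbitrary example values:

    require 'news_crawler/storage/yaml_stor'

    NewsCrawler::Storage::YAMLStor.set_engine(:mongo)   # MongoStorage, NAME = 'mongo'
    NewsCrawler::Storage::YAMLStor.add('MyModule', 'stats', { :pages => 12 })
    NewsCrawler::Storage::YAMLStor.get('MyModule', 'stats')  # => { :pages => 12 }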
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: news_crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.4.pre.1
 platform: ruby
 authors:
 - Hà Quang Dương
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-07-28 00:00:00.000000000 Z
+date: 2013-08-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mongo
@@ -164,6 +164,7 @@ files:
 - lib/news_crawler/downloader.rb
 - lib/news_crawler/link_selector/same_domain_selector.rb
 - lib/news_crawler/nc_logger.rb
+- lib/news_crawler/processing/structure_analysis.rb
 - lib/news_crawler/storage/raw_data.rb
 - lib/news_crawler/storage/raw_data/mongo_storage.rb
 - lib/news_crawler/storage/raw_data/raw_data_engine.rb
@@ -171,6 +172,9 @@ files:
 - lib/news_crawler/storage/url_queue/mongo_storage.rb
 - lib/news_crawler/storage/url_queue/url_queue_engine.rb
 - lib/news_crawler/storage/url_queue/url_queue_error.rb
+- lib/news_crawler/storage/yaml_stor.rb
+- lib/news_crawler/storage/yaml_stor/mongo_storage.rb
+- lib/news_crawler/storage/yaml_stor/yaml_stor_engine.rb
 - lib/news_crawler/url_helper.rb
 - lib/news_crawler/utils/robots_patch.rb
 - lib/news_crawler/default_config.yml
@@ -191,9 +195,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: 2.0.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - '>'
     - !ruby/object:Gem::Version
-      version: '0'
+      version: 1.3.1
 requirements: []
 rubyforge_project:
 rubygems_version: 2.0.3