RubyGems - news_crawler - Versions diffs - 0.0.3 → 0.0.4.pre.1 - Mend

news_crawler 0.0.3 → 0.0.4.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml +4 -4
data/lib/news_crawler/crawler_module.rb +21 -1
data/lib/news_crawler/default_config.yml +3 -2
data/lib/news_crawler/processing/structure_analysis.rb +299 -0
data/lib/news_crawler/storage/yaml_stor/mongo_storage.rb +85 -0
data/lib/news_crawler/storage/yaml_stor/yaml_stor_engine.rb +70 -0
data/lib/news_crawler/storage/yaml_stor.rb +78 -0
metadata +8 -4

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 50f3793b5c9e31eeaba0d7650ddf20d451de16c2
-  data.tar.gz: 5796887e83aa9912b02fa67f69bb1b1f5f34ecec
+  metadata.gz: 37f387711eea761bdcaa5cf16f30eb295b60652e
+  data.tar.gz: 127ab77d72f429a3bacf46a24367331eaadf471f
 SHA512:
-  metadata.gz: 7d97e8b6630aebad685d9c35b848f575eb293e691a3e42e1cb4c86bee1920df27893fdb51d075dad92dbd029a5eae25e26ee1a5ab8ab6f51a31078fbef479caf
-  data.tar.gz: 08486762b1261bd0a5b950e63084a216bfd2a710b15c54bceec1d604dcfe77088da2b9265f9a99093281009427d2eb7dd5766592513b54255f75dcf28c16bc97
+  metadata.gz: 278f80bc1f4eec78a2536ee74edebfe94b26b45a15c9577bedb7214325ad4cdeda6e64eaae002e7f5c48480fe4eaefb413e4e00bef518915de1fb8cfbb08321d
+  data.tar.gz: c57baf50222284a38769636c2cb4cc57e075bf8c008f32765c9dc92a2992afe6eb8d5eb32b6d5e04bf5695167c55be40c74eed4b981462d2a9a27959cc852c3b

data/lib/news_crawler/crawler_module.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 #--
 # NewsCrawler - a website crawler
 #
@@ -20,7 +21,8 @@
 #++
 require 'news_crawler/storage/url_queue'
-require 'thread'
+require 'news_crawler/storage/yaml_stor'
 module NewsCrawler
   # Include this to get basic module methods
@@ -66,5 +68,23 @@ module NewsCrawler
     def next_unprocessed(max_depth = -1)
       # Delegates to URLQueue, identifying this module by the name of the
       # including object's class; max_depth is passed through unchanged.
       URLQueue.next_unprocessed(self.class.name, max_depth)
     end
+    # Ask URLQueue to mark every entry belonging to this module
+    # (looked up by this object's class name) as URLQueue::UNPROCESSED
+    def mark_all_as_unprocessed
+      URLQueue.mark_all(self.class.name, URLQueue::UNPROCESSED)
+    end
+    # Serialize object to YAML and save it (overwrite if key existed)
+    # The entry is filed under this object's class name together with key,
+    # so different modules can reuse the same key
+    # @param [ String ] key
+    # @param [ Object ] value object to serialize
+    def save_yaml(key, value)
+      YAMLStor.add(self.class.name, key, value)
+    end
+    # Load YAML object
+    # @param  [ String ]      key
+    # @return [ Object, nil ]
+    def load_yaml(key, value)
+      YAMLStor.get(self.class.name, key, value)
+    end
   end
 end

data/lib/news_crawler/default_config.yml CHANGED Viewed

@@ -1,4 +1,4 @@
-db:
+:db:
   :engine: :mongo
 :mongodb:
@@ -9,5 +9,6 @@ db:
 :suffix:
   :raw_data: raw_data
   :url_queue: url_queue
+  :yaml: yaml
-prefix: ''
+:prefix: ''

data/lib/news_crawler/processing/structure_analysis.rb ADDED Viewed

@@ -0,0 +1,299 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler.  If not, see <http://www.gnu.org/licenses/>.
+#++
+require 'digest'
+require 'set'
+require 'uri'
+require 'nokogiri'
+require 'news_crawler/url_helper'
+require 'news_crawler/storage/url_queue'
+require 'news_crawler/storage/raw_data'
+require 'news_crawler/crawler_module'
+require 'news_crawler/nc_logger'
+module NewsCrawler
+  module Processing
+    # Analyse website structure to extract content
+    # Database should only contain raw data from one website.
+    class StructureAnalysis
+      include CrawlerModule
+      include URLHelper
+      # Process every unprocessed URL: extract its content, keep the result
+      # in memory (readable through get_result) and persist it as YAML.
+      # NOTE(review): all of this runs inside the constructor, so simply
+      # instantiating the class walks the queue and writes to storage.
+      def initialize
+        # url => result hash returned by extract_content
+        @url_stats = {}
+        # Loop ends when next_unprocessed returns nil/false
+        # (presumably when the queue is drained - confirm against URLQueue)
+        while (url = next_unprocessed)
+          NCLogger.get_logger.info "[NC::P::SA] Processing #{url}"
+          re = extract_content(url)
+          @url_stats[url] = re
+          save_yaml(url, re)
+        end
+      end
+      # Classify a downloaded page and, when it looks like an article,
+      # extract its title and main content
+      # @param  [ String ] url URL whose raw HTML is read from RawData
+      # @return [ Hash ] always has :type (:article or :list). When
+      #   classify_h2 reports :article the hash also gets :content, and
+      #   :title if exactly one h1 is found; without a unique h1 the :type
+      #   is downgraded to :list (the :content is still kept)
+      def extract_content(url)
+        html_doc = RawData.find_by_url(url)
+        # Remove tag causing trouble to nokogiri
+        %w[script iframe style].each do | tag |
+          html_doc = remove_tag(html_doc, tag)
+        end
+        doc = Nokogiri::HTML.parse(html_doc)
+        longest = find_longest_node(doc)
+        # No non-blank text below any element carrying an id: there is
+        # nothing to analyse, so treat the page like the Heuristic 1 case
+        return { :type => :list } if longest.nil?
+        lowest_ancestor, path_to_longest = find_lowest_ancestor_has_id(longest)
+        # Heuristic 1
+        # Longest text sits directly inside an element that has an id
+        # (path is just [text node, that element])
+        # NOTE(review): this exit does not call mark_processed(url) - confirm
+        # that is intended
+        return { :type => :list } if path_to_longest.length == 2
+        # The path runs from the text node up to the ancestor: drop the text
+        # node itself and reverse it to obtain a top-down XPath
+        xpath_path = '//' + path_to_longest[1..-1].reverse.join('/') + '//text()'
+        result = { :type => classify_h2(longest, lowest_ancestor) }
+        if result[:type] == :article
+          title_ = lowest_ancestor.css('h1')
+          if title_.count == 1
+            result[:title] = title_.first.content
+          else
+            # if the title can't be guessed then assume it isn't an article
+            result[:type] = :list
+          end
+          main_content = ''
+          lowest_ancestor.xpath(xpath_path).each do | node |
+            main_content << node.content
+          end
+          result[:content] = main_content
+        end
+        mark_processed(url)
+        result
+      end
+      # Predict type of tree point by root is fragment of article or index page
+      # @param [ Nokogiri::XML::Node ] root
+      # @paran [ Nokogiri::XML::Node ] limit limit node to search backward
+      # @return [ Symbol ] one of :article, :list
+      # Climbs from root towards limit. At each level it counts the children
+      # of the current ancestor whose hash equals the hash of the node just
+      # left. As soon as more than one child matches, or limit is reached,
+      # the subtree of that ancestor is judged by count_a_and_non_a_tag:
+      # more long non-link text nodes than links means :article.
+      # NOTE(review): limit must be an ancestor of root, otherwise the climb
+      # never stops normally.
+      # @param [ Nokogiri::XML::Node ] root
+      # @param [ Nokogiri::XML::Node ] limit limit node to search backward
+      # @return [ Symbol ] one of :article, :list
+      def classify_h2(root, limit)
+        current = root
+        current = current.parent if current.text?
+        depth = 0
+        loop do
+          expect_hash = hash_node(current, 0)
+          current = current.parent
+          depth += 1
+          node_count = current.children.count do | child |
+            hash_node(child, depth - 1) == expect_hash
+          end
+          # Keep climbing until siblings repeat or the limit node is reached
+          next unless node_count > 1 || current == limit
+          a_tag_len, non_a_tag_len = count_a_and_non_a_tag(current)
+          return non_a_tag_len > a_tag_len ? :article : :list
+        end
+      end
+      # Count a tag and non-a tag in tree pointed by node
+      # @param [ Nokogiri::XML::Node ] node
+      # @return [ [Fixnum, Fixnum] ] a tag and non-a tag
+      # Counts two things below node: the a elements, and the text nodes
+      # outside any a element that hold more than 15 non-whitespace characters
+      # @return [ Array ] [ a_tag_len, non_a_tag_len ]
+      def count_a_and_non_a_tag(node)
+        a_tag_len = node.xpath('.//a').count # number of a tag
+        text_nodes = node.xpath('.//text()[not (ancestor::a)]')
+        # Short fragments (15 characters or fewer once whitespace is removed)
+        # are not counted
+        non_a_tag_len = text_nodes.count do | text_node |
+          text_node.content.gsub(/\s+/, '').length > 15
+        end
+        [ a_tag_len, non_a_tag_len ]
+      end
+      # Find the lowest node's ancestor has id attribute
+      # @param [ Nokogiri::XML::Node ] node
+      # @return [ Nokogiri::XML::Node ]
+      # Walks up from node (node itself included) to the first element with
+      # an id attribute, recording one XPath step per visited node
+      # @return [ Array ] two items: the node carrying the id, and the list
+      #   of XPath steps ordered from node up to that ancestor (the last
+      #   step is the id-qualified one)
+      def find_lowest_ancestor_has_id(node)
+        steps = []
+        cursor = node
+        until cursor.has_attribute?('id')
+          class_test = if cursor.has_attribute?('class')
+                         "@class = '#{cursor.attribute('class')}'"
+                       else
+                         'not(@class)'
+                       end
+          steps << "#{cursor.node_name}[#{class_test}]"
+          cursor = cursor.parent
+        end
+        steps << "#{cursor.node_name}[@id='#{cursor.attribute('id')}']"
+        [ cursor, steps ]
+      end
+      # Find longest text node that doesn't have a in ancestors list
+      # @param [ Nokogiri::XML::Node ] doc
+      # Candidates are text nodes below any element that has an id; length
+      # is measured with all whitespace removed. The first node reaching the
+      # greatest length wins.
+      # @return [ Nokogiri::XML::Node, nil ] nil when no candidate has any
+      #   non-whitespace character
+      def find_longest_node(doc)
+        candidates = doc.xpath('//*[@id]//text()[not (ancestor::a)]')
+        best = nil
+        best_len = 0
+        candidates.each do | text_node |
+          len = text_node.content.gsub(/\s/, '').length # ignore whitespace
+          next unless len > best_len
+          best_len = len
+          best = text_node
+        end
+        best
+      end
+      # Remove unwanted HTML tag
+      # @param [ String ] html_doc HTML document
+      # @param [ String ] tag tag to be removed
+      # Matching ignores case (HTML tag names are case-insensitive), stops at
+      # a word boundary so that e.g. 's' does not also match 'script', and
+      # tolerates whitespace before the closing '>'
+      # @return [ String ] copy of html_doc with every <tag ...>...</tag>
+      #   block removed
+      def remove_tag(html_doc, tag)
+        name = Regexp.escape(tag)
+        pattern = Regexp.new("<#{name}\\b.*?>.*?</#{name}\\s*>",
+                             Regexp::MULTILINE | Regexp::IGNORECASE)
+        html_doc.gsub(pattern, '')
+      end
+      # Return String represents node's name, node's id and node's class
+      # @param [ Nokogiri::XML::Node ] node
+      # @return [ String ]
+      # Builds "name#id.class"; the id and class parts appear only when the
+      # node has the attribute
+      def node_info(node)
+        node_pp = node.node_name
+        # node['id'] yields the attribute value as a String; node.attribute
+        # returns an Attr object, which cannot be concatenated to a String
+        node_pp += "##{node['id']}" if node.has_attribute?('id')
+        node_pp += ".#{node['class']}" if node.has_attribute?('class')
+        node_pp
+      end
+      # Calculate hash of a node by its and children info
+      # @param [ Nokogiri::XML::Node ] node
+      # @param [ Fixnum ] limit limit depth of children (-1 for unlimited)
+      # @return [ String ] Hash of node in base 64 encode
+      # The signature is "name#id.class" (id and class only when present)
+      # followed by the hashes of the children, computed recursively.
+      # Identical child hashes are counted once, so a node with one such
+      # child hashes the same as a node with several.
+      # limit 0 hashes the node alone; each level of recursion lowers limit
+      # by one, so a negative limit never reaches 0 and is unlimited.
+      # @return [ String ] Base64-encoded SHA-2 digest of the signature
+      def hash_node(node, limit = -1)
+        signature = node.node_name
+        signature += "##{node['id']}" if node['id']
+        signature += ".#{node['class']}" if node['class']
+        unless limit == 0
+          # Set keeps first-seen order while dropping duplicate hashes
+          child_hashes = Set.new(node.children.map { | child | hash_node(child, limit - 1) })
+          signature += child_hashes.to_a.join
+        end
+        Digest::SHA2.new.base64digest(signature)
+      end
+      # Get and analyse url for information
+      # Collects the href of every a element in the stored page, resolves
+      # links starting with '/' against url, drops values rejected by
+      # is_url?, adds one to @url_stats for each remaining link, and finally
+      # marks url as processed.
+      # NOTE(review): initialize stores result hashes in @url_stats under
+      # page URLs; a link equal to such a key would break the counter here.
+      # @param [ String ] url URL of a page whose raw data is in RawData
+      def analyse(url)
+        html_doc = RawData.find_by_url(url)
+        doc = Nokogiri.HTML(html_doc)
+        inner_url = doc.xpath('//a').map do | a_el |
+          # to_s turns a missing href into '', which is_url? filters out below
+          href = a_el['href'].to_s
+          # Resolve site-relative links against the page's own URL
+          href = URI.join(url, href).to_s if href.start_with?('/')
+          href
+        end
+        inner_url.select { | link | is_url?(link) }.each do | link |
+          @url_stats[link] = (@url_stats[link] || 0) + 1
+        end
+        mark_processed(url)
+      end
+      # Check if it is really 'url'
+      # @param [ String ] url
+      # @return [ Boolean ]
+      # false for the empty string and for the placeholder hrefs '#' and
+      # 'javascript:;', true for anything else
+      def is_url?(url)
+        return false if url.size == 0
+        !['#', 'javascript:;'].include?(url)
+      end
+      # Statistics gathered so far
+      # @return [ Hash ] the @url_stats hash itself (not a copy); initialize
+      #   fills it with url => extract_content result
+      def get_result
+        @url_stats
+      end
+    end
+  end
+end

data/lib/news_crawler/storage/yaml_stor/mongo_storage.rb ADDED Viewed

@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler.  If not, see <http://www.gnu.org/licenses/>.
+#++
+require 'mongo'
+require 'yaml'
+require 'simple_config'
+require 'news_crawler/storage/yaml_stor/yaml_stor_engine'
+require 'news_crawler/nc_logger'
+module NewsCrawler
+  module Storage
+    module YAMLStor
+      # YAML storage implementation using MongoDB
+      class MongoStorage < NewsCrawler::Storage::YAMLStor::YAMLStorEngine
+        # Engine name; YAMLStorEngine.get_engines exposes this class under
+        # the interned form of this string
+        NAME = 'mongo'
+        include Mongo
+        # Connect to MongoDB and select the YAML collection
+        # Host, port, database name, prefix and suffix are all read from the
+        # :application SimpleConfig
+        # @param opts accepted but not used by this engine
+        def initialize(*opts)
+          config = (SimpleConfig.for :application)
+          client = MongoClient.new(config.mongodb.host, config.mongodb.port)
+          db = client[config.mongodb.db_name]
+          # Collection name is "<prefix>_<yaml suffix>"
+          @coll = db[config.prefix + '_' + config.suffix.yaml]
+          # @coll.ensure_index({:key => Mongo::ASCENDING}, {:unique => true})
+        end
+        # Add entry to yaml collection, overwrite old data
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @param [ Object ] value object to store; serialized with to_yaml
+        def add(module_name, key, value)
+          yaml_str = value.to_yaml
+          # Replace invalid or unconvertible characters during the UTF-8
+          # conversion instead of raising
+          yaml_str.encode!('utf-8', :invalid => :replace, :undef => :replace)
+          # Upsert: update the document matching (key, module name) or
+          # create it if there is none
+          @coll.update({:key   => key,
+                         :m_name => module_name},
+                       {:$set  => {:value => yaml_str}},
+                       {:upsert => true})
+        end
+        # Find document with corresponding key and deserialize its value
+        # @param  [ String ] module_name
+        # @param  [ String      ] key
+        # @return [ Object, nil ] nil when no document matches
+        def get(module_name, key)
+          result = @coll.find_one({:key => key,
+                                    :m_name => module_name})
+          if (!result.nil?)
+            # NOTE(review): YAML.load can instantiate arbitrary objects;
+            # this is only safe while the collection holds trusted data
+            YAML.load(result['value'])
+          else
+            nil
+          end
+        end
+        # Get number of entries in the YAML collection
+        def count
+          @coll.count
+        end
+        # Remove every entry from the YAML collection
+        def clear
+          @coll.remove
+        end
+      end
+    end
+  end
+end

data/lib/news_crawler/storage/yaml_stor/yaml_stor_engine.rb ADDED Viewed

@@ -0,0 +1,70 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler.  If not, see <http://www.gnu.org/licenses/>.
+#++
+module NewsCrawler
+  module Storage
+    module YAMLStor
+      # Basic class for YAMLStor engine.
+      # Subclass it and implement all of its methods to create a new YAMLStor
+      # engine; you should keep the methods' signatures unchanged
+      class YAMLStorEngine
+        class << self
+          # Ruby hook: remember each subclass as it is defined
+          def inherited(klass)
+            @engine_list = [*@engine_list, klass]
+          end
+          # Get the registered engines
+          # @return [ Hash ] engine classes keyed by their NAME constant
+          #   converted to a Symbol
+          def get_engines
+            @engine_list ||= []
+            @engine_list.each_with_object({}) do | klass, registry |
+              registry[klass::NAME.intern] = klass
+            end
+          end
+        end
+        # Add entry to YAML storage
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @param [ Object ] value
+        def add(module_name, key, value)
+          raise NotImplementedError
+        end
+        # Get entry from YAML storage
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @return [ Object, nil ] Value or nil if key isn't found
+        def get(module_name, key)
+          raise NotImplementedError
+        end
+        # Number of stored entries
+        def count
+          raise NotImplementedError
+        end
+        # Remove all stored entries
+        def clear
+          raise NotImplementedError
+        end
+      end
+    end
+  end
+end

data/lib/news_crawler/storage/yaml_stor.rb ADDED Viewed

@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler.  If not, see <http://www.gnu.org/licenses/>.
+#++
+require 'simpleconfig'
+#!!!
+require 'news_crawler/storage/yaml_stor/yaml_stor_engine'
+require 'news_crawler/storage/yaml_stor/mongo_storage'
+module NewsCrawler
+  module Storage
+    # YAML data storage
+    # You can use it to store processed data or configuration
+    module YAMLStor
+      class << self
+        # Set YAMLStor storage engine
+        # @param [ Symbol, Object ] engine specify database engine, pass an object for custom engine
+        #   Accepted names are the NAME constants of the classes registered
+        #   with YAMLStorEngine; the bundled MongoDB backend registers as `:mongo`
+        # @param [ Array          ] opts options passed on to the engine's constructor
+        def set_engine(engine, *opts)
+          # Strings (anything responding to intern) are turned into Symbols
+          # so they can be looked up in the engine registry
+          if engine.respond_to? :intern
+            engine = engine.intern
+          end
+          engine_class = YAMLStorEngine.get_engines[engine]
+          if engine_class
+            @engine = engine_class.new(*opts)
+          else
+            # Not a registered name: use the given object itself as the engine
+            @engine = engine
+          end
+        end
+        # Add entry to YAML storage
+        # NOTE(review): this and the methods below assume set_engine has
+        # been called; @engine is nil otherwise
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @param [ Object ] value object to serialize
+        def add(module_name, key, value)
+          @engine.add(module_name, key, value)
+        end
+        # Find document with corresponding key
+        # @param  [ String ]      module_name
+        # @param  [ String ]      key
+        # @return [ Object, nil ]
+        def get(module_name, key)
+          @engine.get(module_name, key)
+        end
+        # Number of entries, as reported by the engine
+        def count
+          @engine.count
+        end
+        # Ask the engine to remove all entries
+        def clear
+          @engine.clear
+        end
+      end
+    end
+  end
+end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: news_crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.4.pre.1
 platform: ruby
 authors:
 - Hà Quang Dương
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-07-28 00:00:00.000000000 Z
+date: 2013-08-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mongo
@@ -164,6 +164,7 @@ files:
 - lib/news_crawler/downloader.rb
 - lib/news_crawler/link_selector/same_domain_selector.rb
 - lib/news_crawler/nc_logger.rb
+- lib/news_crawler/processing/structure_analysis.rb
 - lib/news_crawler/storage/raw_data.rb
 - lib/news_crawler/storage/raw_data/mongo_storage.rb
 - lib/news_crawler/storage/raw_data/raw_data_engine.rb
@@ -171,6 +172,9 @@ files:
 - lib/news_crawler/storage/url_queue/mongo_storage.rb
 - lib/news_crawler/storage/url_queue/url_queue_engine.rb
 - lib/news_crawler/storage/url_queue/url_queue_error.rb
+- lib/news_crawler/storage/yaml_stor.rb
+- lib/news_crawler/storage/yaml_stor/mongo_storage.rb
+- lib/news_crawler/storage/yaml_stor/yaml_stor_engine.rb
 - lib/news_crawler/url_helper.rb
 - lib/news_crawler/utils/robots_patch.rb
 - lib/news_crawler/default_config.yml
@@ -191,9 +195,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: 2.0.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - '>'
     - !ruby/object:Gem::Version
-      version: '0'
+      version: 1.3.1
 requirements: []
 rubyforge_project:
 rubygems_version: 2.0.3