RubyGems - extractcontent - Versions diffs - 0.0.1 - Mend

extractcontent 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

data/History.txt +4 -0
data/License.txt +20 -0
data/Manifest.txt +24 -0
data/README.txt +48 -0
data/Rakefile +4 -0
data/config/hoe.rb +70 -0
data/config/requirements.rb +17 -0
data/lib/extractcontent/version.rb +9 -0
data/lib/extractcontent.rb +202 -0
data/log/debug.log +0 -0
data/script/destroy +14 -0
data/script/generate +14 -0
data/script/txt2html +74 -0
data/setup.rb +1585 -0
data/tasks/deployment.rake +34 -0
data/tasks/environment.rake +7 -0
data/tasks/website.rake +17 -0
data/test/test_extractcontent.rb +31 -0
data/test/test_helper.rb +2 -0
data/website/index.html +114 -0
data/website/index.txt +56 -0
data/website/javascripts/rounded_corners_lite.inc.js +285 -0
data/website/stylesheets/screen.css +138 -0
data/website/template.rhtml +48 -0
metadata +76 -0

data/History.txt ADDED Viewed

@@ -0,0 +1,4 @@
+== 0.0.1 2008-03-24
+* 1 major enhancement:
+  * Initial release

data/License.txt ADDED Viewed

@@ -0,0 +1,20 @@
+Copyright (c) 2007/2008 Nakatani Shuyo / Cybozu Labs Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/Manifest.txt ADDED Viewed

@@ -0,0 +1,24 @@
+History.txt
+License.txt
+Manifest.txt
+README.txt
+Rakefile
+config/hoe.rb
+config/requirements.rb
+lib/extractcontent.rb
+lib/extractcontent/version.rb
+log/debug.log
+script/destroy
+script/generate
+script/txt2html
+setup.rb
+tasks/deployment.rake
+tasks/environment.rake
+tasks/website.rake
+test/test_extractcontent.rb
+test/test_helper.rb
+website/index.html
+website/index.txt
+website/javascripts/rounded_corners_lite.inc.js
+website/stylesheets/screen.css
+website/template.rhtml

data/README.txt ADDED Viewed

@@ -0,0 +1,48 @@
+= extractcontent
+* FIX (url)
+== DESCRIPTION:
+FIX (describe your package)
+== FEATURES/PROBLEMS:
+* FIX (list of features or problems)
+== SYNOPSIS:
+  FIX (code sample of usage)
+== REQUIREMENTS:
+* FIX (list of requirements)
+== INSTALL:
+* FIX (sudo gem install, anything else)
+== LICENSE:
+(The MIT License)
+Copyright (c) 2008 FIX
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+'Software'), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/Rakefile ADDED Viewed

@@ -0,0 +1,4 @@
+require 'config/requirements'
+require 'config/hoe' # setup Hoe + all gem configuration
+Dir['tasks/**/*.rake'].each { |rake| load rake }

data/config/hoe.rb ADDED Viewed

@@ -0,0 +1,70 @@
+require 'extractcontent/version'
+AUTHOR = 'Nakatani Shuyo'  # can also be an array of Authors
+EMAIL = "nakatani.shuyo@gmail.com"
+DESCRIPTION = "This module is to extract the text from web page(html)."
+GEM_NAME = 'extractcontent' # what ppl will type to install your gem
+RUBYFORGE_PROJECT = 'extractcontent' # The unix name for your project
+HOMEPATH = "http://#{RUBYFORGE_PROJECT}.rubyforge.org"
+DOWNLOAD_PATH = "http://rubyforge.org/projects/#{RUBYFORGE_PROJECT}"
+@config_file = "~/.rubyforge/user-config.yml"
+@config = nil
+RUBYFORGE_USERNAME = "shuyo"
+# Loads the RubyForge user config (only while @config is still unset) and
+# copies its "username" entry into RUBYFORGE_USERNAME, replacing that
+# string's contents in place. When the config file cannot be read or
+# parsed, prints setup instructions and exits.
+def rubyforge_username
+  @config ||= begin
+    YAML.load(File.read(File.expand_path(@config_file)))
+  rescue
+    puts <<-EOS
+ERROR: No rubyforge config file found: #{@config_file}
+Run 'rubyforge setup' to prepare your env for access to Rubyforge
+ - See http://newgem.rubyforge.org/rubyforge.html for more details
+    EOS
+    exit
+  end
+  RUBYFORGE_USERNAME.replace @config["username"]
+end
+REV = nil
+# UNCOMMENT IF REQUIRED:
+# REV = `svn info`.each {|line| if line =~ /^Revision:/ then k,v = line.split(': '); break v.chomp; else next; end} rescue nil
+VERS = ExtractContent::VERSION::STRING + (REV ? ".#{REV}" : "")
+RDOC_OPTS = ['--quiet', '--title', 'extractcontent documentation',
+    "--opname", "index.html",
+    "--line-numbers",
+    "--main", "README",
+    "--inline-source"]
+class Hoe
+  def extra_deps
+    @extra_deps.reject! { |x| Array(x).first == 'hoe' }
+    @extra_deps
+  end
+end
+# Generate all the Rake tasks
+# Run 'rake -T' to see list of generated tasks (from gem root directory)
+hoe = Hoe.new(GEM_NAME, VERS) do |p|
+  p.developer(AUTHOR, EMAIL)
+  p.description = DESCRIPTION
+  p.summary = DESCRIPTION
+  p.url = HOMEPATH
+  p.rubyforge_name = RUBYFORGE_PROJECT if RUBYFORGE_PROJECT
+  p.test_globs = ["test/**/test_*.rb"]
+  p.clean_globs |= ['**/.*.sw?', '*.gem', '.config', '**/.DS_Store']  #An array of file patterns to delete on clean.
+  # == Optional
+  p.changes = p.paragraphs_of("History.txt", 0..1).join("\n\n")
+  #p.extra_deps = []     # An array of rubygem dependencies [name, version], e.g. [ ['active_support', '>= 1.3.1'] ]
+  #p.spec_extras = {}    # A hash of extra values to set in the gemspec.
+end
+CHANGES = hoe.paragraphs_of('History.txt', 0..1).join("\\n\\n")
+PATH    = (RUBYFORGE_PROJECT == GEM_NAME) ? RUBYFORGE_PROJECT : "#{RUBYFORGE_PROJECT}/#{GEM_NAME}"
+hoe.remote_rdoc_dir = File.join(PATH.gsub(/^#{RUBYFORGE_PROJECT}\/?/,''), 'rdoc')
+hoe.rsync_args = '-av --delete --ignore-errors'

data/config/requirements.rb ADDED Viewed

@@ -0,0 +1,17 @@
+# Loads everything the Rakefile needs: FileUtils (mixed into the top level),
+# the required build gems, and finally the library itself.
+require 'fileutils'
+include FileUtils
+require 'rubygems'
+# Require each build gem in turn; on the first one that is missing, print
+# an installation hint and exit.
+%w[rake hoe newgem rubigen].each do |req_gem|
+  begin
+    require req_gem
+  rescue LoadError
+    puts "This Rakefile requires the '#{req_gem}' RubyGem."
+    puts "Installation: gem install #{req_gem} -y"
+    exit
+  end
+end
+# Put the project's lib directory (../lib relative to this file) first on
+# the load path so the require below picks up the local sources.
+$:.unshift(File.join(File.dirname(__FILE__), %w[.. lib]))
+require 'extractcontent'

data/lib/extractcontent/version.rb ADDED Viewed

@@ -0,0 +1,9 @@
+module ExtractContent #:nodoc:
+  module VERSION #:nodoc:
+    MAJOR = 0
+    MINOR = 0
+    TINY  = 1
+    STRING = [MAJOR, MINOR, TINY].join('.')
+  end
+end

data/lib/extractcontent.rb ADDED Viewed

@@ -0,0 +1,202 @@
+#!/usr/bin/ruby -Ku
+$KCODE="u"
+# Author:: Nakatani Shuyo
+# Copyright:: (c)2007/2008 Cybozu Labs Inc. All rights reserved.
+# License:: BSD
+# = ExtractContent : Extract Content Module for html
+# This module is to extract the text from web page ( html content ).
+# Automatically extracts the sub-blocks of the html that are most likely to be the main text
+# ( except for menu, comments, navigations, affiliate links and so on ).
+# == PROCESSES
+# - separating blocks from html, calculating score of blocks and ignoring low score blocks.
+# - for score calculation, using block arrangement, text length, whether has affiliate links or characteristic keywords
+# - clustering continuous, high score blocks and comparing among clusters
+# - if including "Google AdSense Section Target", noticing it in particular
+module ExtractContent
+  # Conversion table from character entity references to the characters
+  # they stand for (the two guillemets are written as raw UTF-8 bytes).
+  CHARREF = {
+    '&nbsp;' => ' ',
+    '&lt;'   => '<',
+    '&gt;'   => '>',
+    '&amp;'  => '&',
+    '&laquo;'=> "\xc2\xab",
+    '&raquo;'=> "\xc2\xbb",
+  }
+  # Default option parameters. Each key can be overridden through the
+  # Hash passed to Extractor#initialize or Extractor#analyse.
+  DEFAULT = {
+		:threshold => 100,																				# threshold for the score of a text block
+		:min_length => 80,																				# minimum length of evaluated blocks
+		:decay_factor => 0.73,																		# decay factor for block score
+		:continuous_factor => 1.62, 															# continuous factor for block score ( the larger, the harder to continue )
+		:punctuation_weight => 10,																# score weight for punctuations
+		:punctuations => /(\343\200[\201\202]|\357\274[\201\214\216\237]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)/,
+																															# punctuation characters (byte-level pattern)
+		:waste_expressions => /Copyright|All Rights Reserved/i, 	# characteristic keywords found in footers
+		:debug => false,																					# if true, output block information to stdout
+  }
+  class Extractor < Module
+    # Sets option parameters to default.
+    # Parameter opt is given as Hash
+    def initialize(opt=nil)
+      @default = DEFAULT.clone
+      @default.update(opt) if opt
+    end
+    # Analyses the given HTML text, extracts body and title.
+    def analyse(html, opt=nil)
+      # frameset or redirect
+      return ["", extract_title(html)] if html =~ /<\/frameset>|<meta\s+http-equiv\s*=\s*["']?refresh['"]?[^>]*url/i
+      # option parameters
+      opt = if opt then @default.merge(opt) else @default end
+      # header & title
+      title = if html =~ /<\/head\s*>/im
+        html = $' #'
+        extract_title($`)
+      else
+        extract_title(html)
+      end
+      # Google AdSense Section Target
+      html.gsub!(/<!--\s*google_ad_section_start\(weight=ignore\)\s*-->.*?<!--\s*google_ad_section_end.*?-->/m, '')
+      if html =~ /<!--\s*google_ad_section_start[^>]*-->/n
+        html = html.scan(/<!--\s*google_ad_section_start[^>]*-->.*?<!--\s*google_ad_section_end.*?-->/mn).join("\n")
+      end
+      # eliminate useless text
+      html = eliminate_useless_tags(html)
+      # heading tags including title
+      html.gsub!(/(<h\d\s*>\s*(.*?)\s*<\/h\d\s*>)/is) do |m|
+        if $2.length >= 3 && title.include?($2) then "<div>#{$2}</div>" else $1 end
+      end
+      # extract text blocks
+      factor = continuous = 1.0
+      body = ''
+      score = 0
+      bodylist = []
+      list = html.split(/<\/?(?:div|center|td)[^>]*>|<p\s*[^>]*class\s*=\s*["']?(?:posted|plugin-\w+)['"]?[^>]*>/)
+      list.each do |block|
+        next unless block
+        block.strip!
+        next if has_only_tags(block)
+        continuous /= opt[:continuous_factor] if body.length > 0
+        # ignore link list block
+        notlinked = eliminate_link(block)
+        next if notlinked.length < opt[:min_length]
+        # calculate score of block
+        c = (notlinked.length + notlinked.scan(opt[:punctuations]).length * opt[:punctuation_weight]) * factor
+        factor *= opt[:decay_factor]
+        not_body_rate = block.scan(opt[:waste_expressions]).length + block.scan(/amazon[a-z0-9\.\/\-\?&]+-22/i).length / 2.0
+        c *= (0.72 ** not_body_rate) if not_body_rate>0
+        c1 = c * continuous
+        puts "----- #{c}*#{continuous}=#{c1} #{notlinked.length} \n#{strip_tags(block)[0,100]}\n" if opt[:debug]
+        # treat continuous blocks as cluster
+        if c1 > opt[:threshold]
+          body += block + "\n"
+          score += c1
+          continuous = opt[:continuous_factor]
+        elsif c > opt[:threshold] # continuous block end
+          bodylist << [body, score]
+          body = block + "\n"
+          score = c
+          continuous = opt[:continuous_factor]
+        end
+      end
+      bodylist << [body, score]
+      body = bodylist.inject{|a,b| if a[1]>=b[1] then a else b end }
+      [strip_tags(body[0]), title]
+    end
+    # Extracts title.
+    def extract_title(st)
+      if st =~ /<title[^>]*>\s*(.*?)\s*<\/title\s*>/im
+        strip_tags($1)
+      else
+        ""
+      end
+    end
+    private
+    # Eliminates useless tags
+    def eliminate_useless_tags(html)
+      # eliminate useless symbols
+      html.gsub!(/\342(?:\200[\230-\235]|\206[\220-\223]|\226[\240-\275]|\227[\206-\257]|\230[\205\206])/,'')
+      # eliminate useless html tags
+      html.gsub!(/<(script|style|select|noscript)[^>]*>.*?<\/\1\s*>/imn, '')
+      html.gsub!(/<!--.*?-->/m, '')
+      html.gsub!(/<![A-Za-z].*?>/s, '')
+      html.gsub!(/<div\s[^>]*class\s*=\s*['"]?alpslab-slide["']?[^>]*>.*?<\/div\s*>/m, '')
+      html.gsub!(/<div\s[^>]*(id|class)\s*=\s*['"]?\S*more\S*["']?[^>]*>/is, '')
+      html
+    end
+    # Checks if the given block has only tags without text.
+    def has_only_tags(st)
+      st.gsub(/<[^>]*>/imn, '').gsub("&nbsp;",'').strip.length == 0
+    end
+    # eliminates link tags
+    def eliminate_link(html)
+      count = 0
+      notlinked = html.gsub(/<a\s[^>]*>.*?<\/a\s*>/imn){count+=1;''}.gsub(/<form\s[^>]*>.*?<\/form\s*>/imn, '')
+      notlinked = strip_tags(notlinked)
+      return "" if notlinked.length < 20 * count || islinklist(html)
+      return notlinked
+    end
+    # determines whether a block is link list or not
+    def islinklist(st)
+      if st=~/<(?:ul|dl|ol)(.+?)<\/(?:ul|dl|ol)>/ism
+        listpart = $1
+        outside = st.gsub(/<(?:ul|dl)(.+?)<\/(?:ul|dl)>/ismn, '').gsub(/<.+?>/mn, '').gsub(/\s+/, ' ')
+        list = listpart.split(/<li[^>]*>/)
+        list.shift
+        rate = evaluate_list(list)
+        outside.length <= st.length / (45 / rate)
+      end
+    end
+    # estimates how much degree of link list
+    def evaluate_list(list)
+      return 1 if list.length == 0
+      hit = 0
+      list.each do |line|
+        hit +=1 if line =~ /<a\s+href=(['"]?)([^"'\s]+)\1/imn
+      end
+      return 9 * (1.0 * hit / list.length) ** 2 + 1
+    end
+    # Strips tags from html.
+    def strip_tags(html)
+      st = html.gsub(/<.+?>/m, '')
+      # Convert from wide character to ascii
+      st.gsub!(/\357\274([\201-\272])/){($1[0]-96).chr} # symbols, 0-9, A-Z
+      st.gsub!(/\357\275([\201-\232])/){($1[0]-32).chr} # a-z
+      st.gsub!(/\342[\224\225][\200-\277]/, '') # keisen
+      st.gsub!(/\343\200\200/, ' ')
+      CHARREF.each{|ref, c| st.gsub!(ref, c) }
+      require 'cgi'
+      st = CGI.unescapeHTML(st)
+      st.gsub(/[ \t]+/, " ")
+      st.gsub(/\n\s*/, "\n")
+    end
+  end
+  include Extractor.new
+end

data/log/debug.log ADDED Viewed

File without changes

data/script/destroy ADDED Viewed

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+# Runs the RubiGen "destroy" script with the command-line arguments.
+# APP_ROOT is the parent directory of the one containing this script.
+APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
+# Try a plain require first; fall back to loading RubyGems when that fails.
+begin
+  require 'rubigen'
+rescue LoadError
+  require 'rubygems'
+  require 'rubigen'
+end
+require 'rubigen/scripts/destroy'
+# Drop a leading --help / -h flag before handing ARGV to the script.
+ARGV.shift if ['--help', '-h'].include?(ARGV[0])
+RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
+RubiGen::Scripts::Destroy.new.run(ARGV)

data/script/generate ADDED Viewed

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+# Runs the RubiGen "generate" script with the command-line arguments.
+# APP_ROOT is the parent directory of the one containing this script.
+APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
+# Try a plain require first; fall back to loading RubyGems when that fails.
+begin
+  require 'rubigen'
+rescue LoadError
+  require 'rubygems'
+  require 'rubigen'
+end
+require 'rubigen/scripts/generate'
+# Drop a leading --help / -h flag before handing ARGV to the script.
+ARGV.shift if ['--help', '-h'].include?(ARGV[0])
+RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
+RubiGen::Scripts::Generate.new.run(ARGV)