extractcontent 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt ADDED
@@ -0,0 +1,4 @@
1
+ == 0.0.1 2008-03-24
2
+
3
+ * 1 major enhancement:
4
+ * Initial release
data/License.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2007/2008 Nakatani Shuyo / Cybozu Labs Inc. All rights reserved.
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Manifest.txt ADDED
@@ -0,0 +1,24 @@
1
+ History.txt
2
+ License.txt
3
+ Manifest.txt
4
+ README.txt
5
+ Rakefile
6
+ config/hoe.rb
7
+ config/requirements.rb
8
+ lib/extractcontent.rb
9
+ lib/extractcontent/version.rb
10
+ log/debug.log
11
+ script/destroy
12
+ script/generate
13
+ script/txt2html
14
+ setup.rb
15
+ tasks/deployment.rake
16
+ tasks/environment.rake
17
+ tasks/website.rake
18
+ test/test_extractcontent.rb
19
+ test/test_helper.rb
20
+ website/index.html
21
+ website/index.txt
22
+ website/javascripts/rounded_corners_lite.inc.js
23
+ website/stylesheets/screen.css
24
+ website/template.rhtml
data/README.txt ADDED
@@ -0,0 +1,48 @@
1
+ = extractcontent
2
+
3
+ * FIX (url)
4
+
5
+ == DESCRIPTION:
6
+
7
+ Extracts the main body text from a web page (HTML).
8
+
9
+ == FEATURES/PROBLEMS:
10
+
11
+ * FIX (list of features or problems)
12
+
13
+ == SYNOPSIS:
14
+
15
+ FIX (code sample of usage)
16
+
17
+ == REQUIREMENTS:
18
+
19
+ * FIX (list of requirements)
20
+
21
+ == INSTALL:
22
+
23
+ * FIX (sudo gem install, anything else)
24
+
25
+ == LICENSE:
26
+
27
+ (The MIT License)
28
+
29
+ Copyright (c) 2007/2008 Nakatani Shuyo / Cybozu Labs Inc.
30
+
31
+ Permission is hereby granted, free of charge, to any person obtaining
32
+ a copy of this software and associated documentation files (the
33
+ 'Software'), to deal in the Software without restriction, including
34
+ without limitation the rights to use, copy, modify, merge, publish,
35
+ distribute, sublicense, and/or sell copies of the Software, and to
36
+ permit persons to whom the Software is furnished to do so, subject to
37
+ the following conditions:
38
+
39
+ The above copyright notice and this permission notice shall be
40
+ included in all copies or substantial portions of the Software.
41
+
42
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
43
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
44
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
45
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
46
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
47
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
48
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,4 @@
1
+ require 'config/requirements'
2
+ require 'config/hoe' # setup Hoe + all gem configuration
3
+
4
+ Dir['tasks/**/*.rake'].each { |rake| load rake }
data/config/hoe.rb ADDED
@@ -0,0 +1,70 @@
1
+ require 'extractcontent/version'
2
+
3
+ AUTHOR = 'Nakatani Shuyo' # can also be an array of Authors
4
+ EMAIL = "nakatani.shuyo@gmail.com"
5
+ DESCRIPTION = "This module is to extract the text from web page(html)."
6
+ GEM_NAME = 'extractcontent' # what ppl will type to install your gem
7
+ RUBYFORGE_PROJECT = 'extractcontent' # The unix name for your project
8
+ HOMEPATH = "http://#{RUBYFORGE_PROJECT}.rubyforge.org"
9
+ DOWNLOAD_PATH = "http://rubyforge.org/projects/#{RUBYFORGE_PROJECT}"
10
+
11
+ @config_file = "~/.rubyforge/user-config.yml"
12
+ @config = nil
13
+ RUBYFORGE_USERNAME = "shuyo"
14
# Copies the "username" entry of the RubyForge user config into
# RUBYFORGE_USERNAME (in place) and returns that string.
#
# The YAML file named by @config_file is read only while @config is still
# unset; if reading or parsing it fails, a setup hint is printed and the
# process exits.
def rubyforge_username
  @config ||= begin
    YAML.load(File.read(File.expand_path(@config_file)))
  rescue
    puts <<-EOS
ERROR: No rubyforge config file found: #{@config_file}
Run 'rubyforge setup' to prepare your env for access to Rubyforge
- See http://newgem.rubyforge.org/rubyforge.html for more details
    EOS
    exit
  end
  RUBYFORGE_USERNAME.replace @config["username"]
end
29
+
30
+
31
+ REV = nil
32
+ # UNCOMMENT IF REQUIRED:
33
+ # REV = `svn info`.each {|line| if line =~ /^Revision:/ then k,v = line.split(': '); break v.chomp; else next; end} rescue nil
34
+ VERS = ExtractContent::VERSION::STRING + (REV ? ".#{REV}" : "")
35
+ RDOC_OPTS = ['--quiet', '--title', 'extractcontent documentation',
36
+ "--opname", "index.html",
37
+ "--line-numbers",
38
+ "--main", "README",
39
+ "--inline-source"]
40
+
41
class Hoe
  # Returns @extra_deps after removing, in place, every dependency whose
  # name (the entry itself, or the first element of an array entry) is 'hoe'.
  def extra_deps
    @extra_deps.delete_if { |dependency| Array(dependency).first == 'hoe' }
    @extra_deps
  end
end
47
+
48
+ # Generate all the Rake tasks
49
+ # Run 'rake -T' to see list of generated tasks (from gem root directory)
50
hoe = Hoe.new(GEM_NAME, VERS) do |spec|
  spec.developer(AUTHOR, EMAIL)

  # The same one-line text serves as both description and summary.
  spec.description = DESCRIPTION
  spec.summary = DESCRIPTION
  spec.url = HOMEPATH
  spec.rubyforge_name = RUBYFORGE_PROJECT if RUBYFORGE_PROJECT
  spec.test_globs = ["test/**/test_*.rb"]
  # File patterns to delete on clean, merged into Hoe's existing list.
  spec.clean_globs |= ['**/.*.sw?', '*.gem', '.config', '**/.DS_Store']

  # == Optional
  spec.changes = spec.paragraphs_of("History.txt", 0..1).join("\n\n")
  #spec.extra_deps = [] # An array of rubygem dependencies [name, version], e.g. [ ['active_support', '>= 1.3.1'] ]

  #spec.spec_extras = {} # A hash of extra values to set in the gemspec.
end
66
+
67
+ CHANGES = hoe.paragraphs_of('History.txt', 0..1).join("\\n\\n")
68
+ PATH = (RUBYFORGE_PROJECT == GEM_NAME) ? RUBYFORGE_PROJECT : "#{RUBYFORGE_PROJECT}/#{GEM_NAME}"
69
+ hoe.remote_rdoc_dir = File.join(PATH.gsub(/^#{RUBYFORGE_PROJECT}\/?/,''), 'rdoc')
70
+ hoe.rsync_args = '-av --delete --ignore-errors'
@@ -0,0 +1,17 @@
1
+ require 'fileutils'
2
+ include FileUtils
3
+
4
+ require 'rubygems'
5
# Load every gem the Rakefile depends on; on the first one that is missing,
# print an installation hint and stop.
required_gems = %w[rake hoe newgem rubigen]
required_gems.each do |gem_name|
  begin
    require gem_name
  rescue LoadError
    puts "This Rakefile requires the '#{gem_name}' RubyGem."
    puts "Installation: gem install #{gem_name} -y"
    exit
  end
end
14
+
15
+ $:.unshift(File.join(File.dirname(__FILE__), %w[.. lib]))
16
+
17
+ require 'extractcontent'
@@ -0,0 +1,9 @@
1
module ExtractContent #:nodoc:
  module VERSION #:nodoc:
    # Version components; STRING is their dotted form.
    MAJOR, MINOR, TINY = 0, 0, 1

    STRING = "#{MAJOR}.#{MINOR}.#{TINY}"
  end
end
@@ -0,0 +1,202 @@
1
+ #!/usr/bin/ruby -Ku
2
+ $KCODE="u"
3
+
4
+ # Author:: Nakatani Shuyo
5
+ # Copyright:: (c)2007/2008 Cybozu Labs Inc. All rights reserved.
6
+ # License:: BSD
7
+
8
+ # = ExtractContent : Extract Content Module for html
9
+ # This module is to extract the text from web page ( html content ).
10
+ # Automatically extracts the sub-blocks of html that are most likely to be the main text
11
+ # ( except for menu, comments, navigations, affiliate links and so on ).
12
+
13
+ # == PROCESSES
14
+ # - separating blocks from html, calculating score of blocks and ignoring low score blocks.
15
+ # - for score calculation, using block arrangement, text length, whether has affiliate links or characteristic keywords
16
+ # - clustering continuous, high score blocks and comparing among clusters
17
+ # - if including "Google AdSense Section Target", noticing it in particular
18
+
19
module ExtractContent
  require 'cgi' # CGI.unescapeHTML is used by Extractor#strip_tags

  # Convert from character entity references
  CHARREF = {
    '&nbsp;' => ' ',
    '&lt;' => '<',
    '&gt;' => '>',
    '&amp;' => '&',
    '&laquo;' => "\u00AB",
    '&raquo;' => "\u00BB",
  }

  # Default option parameters.
  DEFAULT = {
    :threshold => 100,          # threshold for score of the text
    :min_length => 80,          # minimum length (in bytes) of evaluated blocks
    :decay_factor => 0.73,      # decay factor for block score
    :continuous_factor => 1.62, # continuous factor for block score ( the larger, the harder to continue )
    :punctuation_weight => 10,  # score weight for punctuations
    # punctuation characters: ideographic comma / full stop, fullwidth ! , . ?
    # and their ASCII counterparts
    :punctuations => /([\u3001\u3002\uFF01\uFF0C\uFF0E\uFF1F]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)/,
    :waste_expressions => /Copyright|All Rights Reserved/i, # characteristic keywords including footer
    :debug => false,            # if true, output block information to stdout
  }

  # Extracts the main text and the title from an HTML document.
  #
  # Text lengths used for scoring are measured in bytes (String#bytesize),
  # which is what String#length returned on the Ruby 1.8 runtime the default
  # thresholds were written for.
  class Extractor < Module

    # Sets option parameters to default.
    # Parameter opt is given as Hash and overrides entries of DEFAULT.
    def initialize(opt=nil)
      super()
      @default = DEFAULT.dup
      @default.update(opt) if opt
    end

    # Analyses the given HTML text, extracts body and title.
    #
    # html:: the document (treated as UTF-8); it is never modified
    # opt::  optional Hash overriding the instance options for this call
    #
    # Returns [body_text, title]. body_text is "" for frameset / redirect
    # pages and when no block scores above the threshold.
    def analyse(html, opt=nil)
      html = to_utf8(html)

      # frameset or redirect
      return ["", title_from(html)] if html =~ /<\/frameset>|<meta\s+http-equiv\s*=\s*["']?refresh['"]?[^>]*url/i

      # option parameters
      opt = opt ? @default.merge(opt) : @default

      # header & title: the title is looked up in the head only, and the
      # head is dropped from the text to analyse
      head_end = /<\/head\s*>/i.match(html)
      if head_end
        title = title_from(head_end.pre_match)
        html = head_end.post_match
      else
        title = title_from(html)
      end

      # Google AdSense Section Target: drop ignored sections, and when
      # targeted sections exist keep nothing but them
      html = html.gsub(/<!--\s*google_ad_section_start\(weight=ignore\)\s*-->.*?<!--\s*google_ad_section_end.*?-->/m, '')
      if html =~ /<!--\s*google_ad_section_start[^>]*-->/
        html = html.scan(/<!--\s*google_ad_section_start[^>]*-->.*?<!--\s*google_ad_section_end.*?-->/m).join("\n")
      end

      # eliminate useless text
      html = eliminate_useless_tags(html)

      # heading tags including title
      html = html.gsub(/<h\d\s*>\s*(.*?)\s*<\/h\d\s*>/i) do
        whole = Regexp.last_match(0)
        heading = Regexp.last_match(1)
        if heading.bytesize >= 3 && title.include?(heading) then "<div>#{heading}</div>" else whole end
      end

      # extract text blocks
      factor = continuous = 1.0
      body = ''
      score = 0
      bodylist = []
      list = html.split(/<\/?(?:div|center|td)[^>]*>|<p\s*[^>]*class\s*=\s*["']?(?:posted|plugin-\w+)['"]?[^>]*>/)
      list.each do |raw|
        block = raw.strip
        next if has_only_tags(block)
        continuous /= opt[:continuous_factor] unless body.empty?

        # ignore link list block
        notlinked = eliminate_link(block)
        next if notlinked.bytesize < opt[:min_length]

        # calculate score of block
        c = (notlinked.bytesize + notlinked.scan(opt[:punctuations]).length * opt[:punctuation_weight]) * factor
        factor *= opt[:decay_factor]
        not_body_rate = block.scan(opt[:waste_expressions]).length + block.scan(/amazon[a-z0-9\.\/\-\?&]+-22/i).length / 2.0
        c *= (0.72 ** not_body_rate) if not_body_rate > 0
        c1 = c * continuous
        puts "----- #{c}*#{continuous}=#{c1} #{notlinked.bytesize} \n#{strip_tags(block)[0,100]}\n" if opt[:debug]

        # treat continuous blocks as cluster
        if c1 > opt[:threshold]
          body += block + "\n"
          score += c1
          continuous = opt[:continuous_factor]
        elsif c > opt[:threshold] # continuous block end
          bodylist << [body, score]
          body = block + "\n"
          score = c
          continuous = opt[:continuous_factor]
        end
      end
      bodylist << [body, score]

      # the cluster with the highest score wins (the earliest one on a tie)
      best = bodylist.inject { |a, b| a[1] >= b[1] ? a : b }
      [strip_tags(best[0]), title]
    end

    # Extracts title.
    # Returns the tag-stripped content of the first <title> element, or ""
    # when there is none.
    def extract_title(st)
      title_from(to_utf8(st))
    end

    private

    # Returns text as a valid UTF-8 string. Binary strings are assumed to
    # hold UTF-8 bytes, other encodings are transcoded, and invalid byte
    # sequences are replaced. The argument itself is left untouched.
    def to_utf8(text)
      return text if text.encoding == Encoding::UTF_8 && text.valid_encoding?
      utf8 = if text.encoding == Encoding::ASCII_8BIT
        text.dup.force_encoding(Encoding::UTF_8)
      else
        text.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
      end
      utf8.scrub
    end

    # Title lookup on a string that is already valid UTF-8.
    def title_from(st)
      if st =~ /<title[^>]*>\s*(.*?)\s*<\/title\s*>/im
        strip_tags(Regexp.last_match(1))
      else
        ""
      end
    end

    # Eliminates useless tags. Returns a new string.
    def eliminate_useless_tags(html)
      # eliminate useless symbols (curly quotes, arrows, geometric shapes, stars)
      html = html.gsub(/[\u2018-\u201D\u2190-\u2193\u25A0-\u25BD\u25C6-\u25EF\u2605\u2606]/, '')

      # eliminate useless html tags
      html = html.gsub(/<(script|style|select|noscript)[^>]*>.*?<\/\1\s*>/im, '')
      html = html.gsub(/<!--.*?-->/m, '')
      html = html.gsub(/<![A-Za-z].*?>/, '')
      html = html.gsub(/<div\s[^>]*class\s*=\s*['"]?alpslab-slide["']?[^>]*>.*?<\/div\s*>/m, '')
      html.gsub(/<div\s[^>]*(id|class)\s*=\s*['"]?\S*more\S*["']?[^>]*>/i, '')
    end

    # Checks if the given block has only tags without text.
    def has_only_tags(st)
      st.gsub(/<[^>]*>/, '').gsub("&nbsp;", '').strip.empty?
    end

    # eliminates link tags
    # Returns the block text without links and forms, or "" when the block
    # is mostly links (less than 20 bytes of text per link) or is a link list.
    def eliminate_link(html)
      count = 0
      notlinked = html.gsub(/<a\s[^>]*>.*?<\/a\s*>/im) { count += 1; '' }.gsub(/<form\s[^>]*>.*?<\/form\s*>/im, '')
      notlinked = strip_tags(notlinked)
      return "" if notlinked.bytesize < 20 * count || islinklist(html)
      notlinked
    end

    # determines whether a block is link list or not
    # Returns nil when the block contains no ul / dl / ol list at all.
    def islinklist(st)
      return unless st =~ /<(?:ul|dl|ol)(.+?)<\/(?:ul|dl|ol)>/im
      listpart = Regexp.last_match(1)
      # text outside of the lists; the same list tags that were detected
      # above are removed here
      outside = st.gsub(/<(?:ul|dl|ol)(.+?)<\/(?:ul|dl|ol)>/im, '').gsub(/<.+?>/m, '').gsub(/\s+/, ' ')
      list = listpart.split(/<li[^>]*>/)
      list.shift # drop whatever precedes the first <li>
      rate = evaluate_list(list)
      outside.bytesize <= st.bytesize / (45 / rate)
    end

    # estimates how much degree of link list
    # Returns 1 for an empty list, otherwise a value from 1.0 (no item
    # starts a link) to 10.0 (every item does).
    def evaluate_list(list)
      return 1 if list.empty?
      hit = list.count { |line| line =~ /<a\s+href=(['"]?)([^"'\s]+)\1/im }
      9 * (1.0 * hit / list.length) ** 2 + 1
    end

    # Strips tags from html.
    # Also normalises the text: fullwidth ASCII becomes ASCII, box-drawing
    # characters are removed, entities are decoded once, and whitespace is
    # collapsed.
    def strip_tags(html)
      st = html.gsub(/<.+?>/m, '')
      # Convert from wide character to ascii ( symbols, 0-9, A-Z, a-z )
      st = st.gsub(/[\uFF01-\uFF3A\uFF41-\uFF5A]/) { |wide| (wide.ord - 0xFEE0).chr }
      st = st.gsub(/[\u2500-\u257F]/, '') # keisen ( box drawing )
      st = st.gsub("\u3000", ' ')         # ideographic space
      # Decode every reference in a single pass so that already-decoded text
      # is never decoded a second time ( "&amp;lt;" must become "&lt;" ).
      st = st.gsub(/&#?\w+;/) { |ref| CHARREF.fetch(ref) { CGI.unescapeHTML(ref) } }
      st = st.gsub(/[ \t]+/, " ")
      st.gsub(/\n\s*/, "\n")
    end
  end

  include Extractor.new
end
data/log/debug.log ADDED
File without changes
data/script/destroy ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
# Root directory of the gem: the parent of this script's directory.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Try a plain require first; fall back to loading RubyGems when rubigen is
# not already on the load path.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/destroy'

# A leading help flag is dropped before the arguments reach RubiGen.
ARGV.shift if %w[--help -h].include?(ARGV.first)
component_sources = [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Base.use_component_sources! component_sources
RubiGen::Scripts::Destroy.new.run(ARGV)
data/script/generate ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
# Root directory of the gem: the parent of this script's directory.
APP_ROOT = File.expand_path('..', File.dirname(__FILE__))

# Try a plain require first; fall back to loading RubyGems when rubigen is
# not already on the load path.
begin
  require 'rubigen'
rescue LoadError
  require 'rubygems'
  require 'rubigen'
end
require 'rubigen/scripts/generate'

# A leading help flag is dropped before the arguments reach RubiGen.
ARGV.shift if %w[--help -h].include?(ARGV.first)
component_sources = [:rubygems, :newgem, :newgem_theme, :test_unit]
RubiGen::Base.use_component_sources! component_sources
RubiGen::Scripts::Generate.new.run(ARGV)