extractcontent 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt ADDED
@@ -0,0 +1,4 @@
1
+ == 0.0.1 2008-03-24
2
+
3
+ * 1 major enhancement:
4
+ * Initial release
data/License.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2007/2008 Nakatani Shuyo / Cybozu Labs Inc. All rights reserved.
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Manifest.txt ADDED
@@ -0,0 +1,24 @@
1
+ History.txt
2
+ License.txt
3
+ Manifest.txt
4
+ README.txt
5
+ Rakefile
6
+ config/hoe.rb
7
+ config/requirements.rb
8
+ lib/extractcontent.rb
9
+ lib/extractcontent/version.rb
10
+ log/debug.log
11
+ script/destroy
12
+ script/generate
13
+ script/txt2html
14
+ setup.rb
15
+ tasks/deployment.rake
16
+ tasks/environment.rake
17
+ tasks/website.rake
18
+ test/test_extractcontent.rb
19
+ test/test_helper.rb
20
+ website/index.html
21
+ website/index.txt
22
+ website/javascripts/rounded_corners_lite.inc.js
23
+ website/stylesheets/screen.css
24
+ website/template.rhtml
data/README.txt ADDED
@@ -0,0 +1,48 @@
1
+ = extractcontent
2
+
3
+ * FIX (url)
4
+
5
+ == DESCRIPTION:
6
+
7
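+ ExtractContent extracts the main body text and the title from a web page (HTML), ignoring menus, comments, navigation and affiliate links.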
8
+
9
+ == FEATURES/PROBLEMS:
10
+
11
+ * FIX (list of features or problems)
12
+
13
+ == SYNOPSIS:
14
+
15
+ FIX (code sample of usage)
16
+
17
+ == REQUIREMENTS:
18
+
19
+ * FIX (list of requirements)
20
+
21
+ == INSTALL:
22
+
23
+ * FIX (sudo gem install, anything else)
24
+
25
+ == LICENSE:
26
+
27
+ (The MIT License)
28
+
29
+ Copyright (c) 2008 FIX
30
+
31
+ Permission is hereby granted, free of charge, to any person obtaining
32
+ a copy of this software and associated documentation files (the
33
+ 'Software'), to deal in the Software without restriction, including
34
+ without limitation the rights to use, copy, modify, merge, publish,
35
+ distribute, sublicense, and/or sell copies of the Software, and to
36
+ permit persons to whom the Software is furnished to do so, subject to
37
+ the following conditions:
38
+
39
+ The above copyright notice and this permission notice shall be
40
+ included in all copies or substantial portions of the Software.
41
+
42
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
43
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
44
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
45
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
46
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
47
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
48
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,4 @@
1
+ require 'config/requirements'
2
+ require 'config/hoe' # setup Hoe + all gem configuration
3
+
4
+ Dir['tasks/**/*.rake'].each { |rake| load rake }
data/config/hoe.rb ADDED
@@ -0,0 +1,70 @@
1
+ require 'extractcontent/version'
2
+
3
+ AUTHOR = 'Nakatani Shuyo' # can also be an array of Authors
4
+ EMAIL = "nakatani.shuyo@gmail.com"
5
+ DESCRIPTION = "This module is to extract the text from web page(html)."
6
+ GEM_NAME = 'extractcontent' # what ppl will type to install your gem
7
+ RUBYFORGE_PROJECT = 'extractcontent' # The unix name for your project
8
+ HOMEPATH = "http://#{RUBYFORGE_PROJECT}.rubyforge.org"
9
+ DOWNLOAD_PATH = "http://rubyforge.org/projects/#{RUBYFORGE_PROJECT}"
10
+
11
+ @config_file = "~/.rubyforge/user-config.yml"
12
+ @config = nil
13
+ RUBYFORGE_USERNAME = "shuyo"
14
# Copies the "username" entry of the RubyForge user configuration into
# RUBYFORGE_USERNAME (in place, via String#replace).
#
# The YAML file named by @config_file is read once and cached in @config.
# When it cannot be read or parsed, a setup hint plus the underlying error
# is written to stderr and the process exits with status 1, so a calling
# rake task is reported as failed.
def rubyforge_username
  unless @config
    begin
      @config = YAML.load(File.read(File.expand_path(@config_file)))
    rescue StandardError => e
      $stderr.puts <<-EOS
ERROR: No rubyforge config file found: #{@config_file}
Run 'rubyforge setup' to prepare your env for access to Rubyforge
- See http://newgem.rubyforge.org/rubyforge.html for more details
(#{e.class}: #{e.message})
      EOS
      exit 1
    end
  end
  RUBYFORGE_USERNAME.replace @config["username"]
end
29
+
30
+
31
+ REV = nil
32
+ # UNCOMMENT IF REQUIRED:
33
+ # REV = `svn info`.each {|line| if line =~ /^Revision:/ then k,v = line.split(': '); break v.chomp; else next; end} rescue nil
34
+ VERS = ExtractContent::VERSION::STRING + (REV ? ".#{REV}" : "")
35
+ RDOC_OPTS = ['--quiet', '--title', 'extractcontent documentation',
36
+ "--opname", "index.html",
37
+ "--line-numbers",
38
+ "--main", "README",
39
+ "--inline-source"]
40
+
41
+ class Hoe
42
+ def extra_deps
43
+ @extra_deps.reject! { |x| Array(x).first == 'hoe' }
44
+ @extra_deps
45
+ end
46
+ end
47
+
48
+ # Generate all the Rake tasks
49
+ # Run 'rake -T' to see list of generated tasks (from gem root directory)
50
+ hoe = Hoe.new(GEM_NAME, VERS) do |p|
51
+ p.developer(AUTHOR, EMAIL)
52
+ p.description = DESCRIPTION
53
+ p.summary = DESCRIPTION
54
+ p.url = HOMEPATH
55
+ p.rubyforge_name = RUBYFORGE_PROJECT if RUBYFORGE_PROJECT
56
+ p.test_globs = ["test/**/test_*.rb"]
57
+ p.clean_globs |= ['**/.*.sw?', '*.gem', '.config', '**/.DS_Store'] #An array of file patterns to delete on clean.
58
+
59
+ # == Optional
60
+ p.changes = p.paragraphs_of("History.txt", 0..1).join("\n\n")
61
+ #p.extra_deps = [] # An array of rubygem dependencies [name, version], e.g. [ ['active_support', '>= 1.3.1'] ]
62
+
63
+ #p.spec_extras = {} # A hash of extra values to set in the gemspec.
64
+
65
+ end
66
+
67
+ CHANGES = hoe.paragraphs_of('History.txt', 0..1).join("\\n\\n")
68
+ PATH = (RUBYFORGE_PROJECT == GEM_NAME) ? RUBYFORGE_PROJECT : "#{RUBYFORGE_PROJECT}/#{GEM_NAME}"
69
+ hoe.remote_rdoc_dir = File.join(PATH.gsub(/^#{RUBYFORGE_PROJECT}\/?/,''), 'rdoc')
70
+ hoe.rsync_args = '-av --delete --ignore-errors'
@@ -0,0 +1,17 @@
1
+ require 'fileutils'
2
+ include FileUtils
3
+
4
+ require 'rubygems'
5
# Load every gem the Rakefile depends on. When one is missing, explain how
# to install it and stop with a non-zero exit status (Kernel#abort writes
# the message to stderr and exits with status 1), so the failed rake run is
# not mistaken for a successful one.
%w[rake hoe newgem rubigen].each do |req_gem|
  begin
    require req_gem
  rescue LoadError
    # The former "-y" (--include-dependencies) flag is omitted: dependency
    # installation is the default behaviour of "gem install".
    abort "This Rakefile requires the '#{req_gem}' RubyGem.\n" \
          "Installation: gem install #{req_gem}"
  end
end
14
+
15
+ $:.unshift(File.join(File.dirname(__FILE__), %w[.. lib]))
16
+
17
+ require 'extractcontent'
@@ -0,0 +1,9 @@
1
module ExtractContent #:nodoc:
  # Version number of the gem, as separate components and as a dotted string.
  module VERSION #:nodoc:
    MAJOR, MINOR, TINY = 0, 0, 1

    # "MAJOR.MINOR.TINY"
    STRING = "#{MAJOR}.#{MINOR}.#{TINY}"
  end
end
@@ -0,0 +1,202 @@
1
+ #!/usr/bin/ruby -Ku
2
+ $KCODE="u"
3
+
4
+ # Author:: Nakatani Shuyo
5
+ # Copyright:: (c)2007/2008 Cybozu Labs Inc. All rights reserved.
6
+ # License:: BSD
7
+
8
+ # = ExtractContent : Extract Content Module for html
9
+ # This module is to extract the text from web page ( html content ).
10
+ # It automatically extracts the sub blocks of the html that are most likely to be the main text
11
+ # ( except for menu, comments, navigations, affiliate links and so on ).
12
+
13
+ # == PROCESSES
14
+ # - separating blocks from html, calculating score of blocks and ignoring low score blocks.
15
+ # - the score is calculated from block arrangement, text length, and whether the block has affiliate links or characteristic keywords
16
+ # - clustering continuous, high score blocks and comparing among clusters
17
+ # - if the html includes a "Google AdSense Section Target", giving it special treatment
18
+
19
module ExtractContent
  require 'cgi'

  # Returns the UTF-8 string for the given Unicode code points. The patterns
  # below are built from code points instead of raw byte ranges such as
  # [\201-\272], which Ruby 1.9+ rejects as invalid multibyte escapes.
  utf8 = lambda { |*codepoints| codepoints.pack('U*') }

  # Convert from character entity references
  CHARREF = {
    '&nbsp;' => ' ',
    '&lt;'   => '<',
    '&gt;'   => '>',
    '&amp;'  => '&',
    '&laquo;'=> "\xc2\xab",
    '&raquo;'=> "\xc2\xbb",
  }

  # A single named or numeric character reference ("&amp;", "&#169;", "&#xA9;").
  ENTITY_REF = /&(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#[xX][0-9A-Fa-f]+);/

  # Symbols removed from the html before scoring: U+2018-201D (quotation
  # marks), U+2190-2193 (arrows), U+25A0-25BD and U+25C6-25EF (geometric
  # shapes), U+2605 and U+2606 (stars).
  USELESS_SYMBOLS = /[#{utf8[0x2018]}-#{utf8[0x201D]}#{utf8[0x2190]}-#{utf8[0x2193]}#{utf8[0x25A0]}-#{utf8[0x25BD]}#{utf8[0x25C6]}-#{utf8[0x25EF]}#{utf8[0x2605, 0x2606]}]/

  # Fullwidth forms of "!".."Z" (U+FF01-FF3A) and "a".."z" (U+FF41-FF5A).
  WIDE_ASCII = /[#{utf8[0xFF01]}-#{utf8[0xFF3A]}#{utf8[0xFF41]}-#{utf8[0xFF5A]}]/

  # Distance between a fullwidth form and its ASCII counterpart.
  WIDE_ASCII_OFFSET = 0xFEE0

  # Box drawing characters (keisen), U+2500-257F.
  BOX_DRAWING = /[#{utf8[0x2500]}-#{utf8[0x257F]}]/

  # Ideographic (fullwidth) space, U+3000.
  WIDE_SPACE = utf8[0x3000]

  # Default option parameters.
  DEFAULT = {
    :threshold => 100,                                  # threshold for score of the text
    :min_length => 80,                                  # minimum length of evaluated blocks
    :decay_factor => 0.73,                              # decay factor for block score
    :continuous_factor => 1.62,                         # continuous factor for block score ( the larger, the harder to continue )
    :punctuation_weight => 10,                          # score weight for punctuations
    # punctuation characters: U+3001, U+3002, U+FF01, U+FF0C, U+FF0E, U+FF1F
    # and ASCII ". , ! ?" (a "." or "," inside a number is not counted)
    :punctuations => /([#{utf8[0x3001, 0x3002, 0xFF01, 0xFF0C, 0xFF0E, 0xFF1F]}]|\.[^A-Za-z0-9]|,[^0-9]|!|\?)/,
    :waste_expressions => /Copyright|All Rights Reserved/i, # characteristic keywords including footer
    :debug => false,                                    # if true, output block information to stdout
  }

  # Extracts the main text and the title from an html string.
  #
  #   body, title = ExtractContent::Extractor.new.analyse(html)
  #
  # The html is expected to be UTF-8. Lengths are measured with String#length.
  class Extractor < Module

    # Sets option parameters to default.
    # Parameter opt is given as Hash; its entries override DEFAULT for every
    # later call of analyse on this instance.
    def initialize(opt=nil)
      super()
      @default = DEFAULT.clone
      @default.update(opt) if opt
    end

    # Analyses the given HTML text, extracts body and title.
    #
    # html:: the page source; it is not modified.
    # opt::  optional Hash of parameters (same keys as DEFAULT) overriding
    #        the instance defaults for this call only.
    #
    # Returns [body, title], both with tags stripped. For frameset and
    # meta-refresh pages the body is "".
    def analyse(html, opt=nil)
      # frameset or redirect
      return ["", extract_title(html)] if html =~ /<\/frameset>|<meta\s+http-equiv\s*=\s*["']?refresh['"]?[^>]*url/i

      # option parameters
      opt = opt ? @default.merge(opt) : @default

      # header & title: only the part after </head> is scored
      if html =~ /<\/head\s*>/im
        head, html = $`, $'
        title = extract_title(head)
      else
        title = extract_title(html)
      end

      # Google AdSense Section Target: drop sections marked "ignore"; when
      # other sections are marked, keep only those. Non-destructive gsub is
      # used so the caller's string is never changed.
      html = html.gsub(/<!--\s*google_ad_section_start\(weight=ignore\)\s*-->.*?<!--\s*google_ad_section_end.*?-->/m, '')
      if html =~ /<!--\s*google_ad_section_start[^>]*-->/
        html = html.scan(/<!--\s*google_ad_section_start[^>]*-->.*?<!--\s*google_ad_section_end.*?-->/m).join("\n")
      end

      # eliminate useless text
      html = eliminate_useless_tags(html)

      # heading tags including title: a heading repeated in the title becomes
      # a plain block so that it acts as a block separator
      html = html.gsub(/(<h\d\s*>\s*(.*?)\s*<\/h\d\s*>)/i) do
        $2.length >= 3 && title.include?($2) ? "<div>#{$2}</div>" : $1
      end

      # extract text blocks
      factor = continuous = 1.0
      body = ''
      score = 0
      bodylist = []
      list = html.split(/<\/?(?:div|center|td)[^>]*>|<p\s*[^>]*class\s*=\s*["']?(?:posted|plugin-\w+)['"]?[^>]*>/)
      list.each do |block|
        next unless block
        block = block.strip
        next if has_only_tags(block)
        continuous /= opt[:continuous_factor] if body.length > 0

        # ignore link list block
        notlinked = eliminate_link(block)
        next if notlinked.length < opt[:min_length]

        # calculate score of block
        c = (notlinked.length + notlinked.scan(opt[:punctuations]).length * opt[:punctuation_weight]) * factor
        factor *= opt[:decay_factor]
        not_body_rate = block.scan(opt[:waste_expressions]).length + block.scan(/amazon[a-z0-9\.\/\-\?&]+-22/i).length / 2.0
        c *= (0.72 ** not_body_rate) if not_body_rate > 0
        c1 = c * continuous
        puts "----- #{c}*#{continuous}=#{c1} #{notlinked.length} \n#{strip_tags(block)[0,100]}\n" if opt[:debug]

        # treat continuous blocks as cluster
        if c1 > opt[:threshold]
          body += block + "\n"
          score += c1
          continuous = opt[:continuous_factor]
        elsif c > opt[:threshold] # continuous block end
          bodylist << [body, score]
          body = block + "\n"
          score = c
          continuous = opt[:continuous_factor]
        end
      end
      bodylist << [body, score]

      # the cluster with the highest score wins; the earlier one on a tie
      best = bodylist.inject { |a, b| a[1] >= b[1] ? a : b }
      [strip_tags(best[0]), title]
    end

    # Extracts title.
    # Returns the text of the first <title> element, or "" when there is none.
    def extract_title(st)
      if st =~ /<title[^>]*>\s*(.*?)\s*<\/title\s*>/im
        strip_tags($1)
      else
        ""
      end
    end

    private

    # Eliminates useless tags
    # Returns a new string; the argument is left untouched.
    def eliminate_useless_tags(html)
      # eliminate useless symbols
      html = html.gsub(USELESS_SYMBOLS, '')

      # eliminate useless html tags
      html.gsub!(/<(script|style|select|noscript)[^>]*>.*?<\/\1\s*>/im, '')
      html.gsub!(/<!--.*?-->/m, '')
      # declarations such as <!DOCTYPE ...>; "." does not cross newlines here
      html.gsub!(/<![A-Za-z].*?>/, '')
      html.gsub!(/<div\s[^>]*class\s*=\s*['"]?alpslab-slide["']?[^>]*>.*?<\/div\s*>/m, '')
      # only the opening tag of <div id/class="...more..."> is removed
      html.gsub!(/<div\s[^>]*(id|class)\s*=\s*['"]?\S*more\S*["']?[^>]*>/i, '')

      html
    end

    # Checks if the given block has only tags without text.
    def has_only_tags(st)
      st.gsub(/<[^>]*>/im, '').gsub("&nbsp;", '').strip.length == 0
    end

    # eliminates link tags
    # Returns the block text without links and forms, or "" when the block
    # is mostly links (less than 20 characters of text per link) or is a
    # link list.
    def eliminate_link(html)
      count = 0
      notlinked = html.gsub(/<a\s[^>]*>.*?<\/a\s*>/im) { count += 1; '' }.gsub(/<form\s[^>]*>.*?<\/form\s*>/im, '')
      notlinked = strip_tags(notlinked)
      return "" if notlinked.length < 20 * count || islinklist(html)
      notlinked
    end

    # determines whether a block is link list or not
    # Compares the amount of text outside the lists with the size of the
    # block; the more list items are links, the stricter the comparison.
    def islinklist(st)
      return false unless st =~ /<(?:ul|dl|ol)(.+?)<\/(?:ul|dl|ol)>/im
      listpart = $1
      # the same three list types that are detected above are removed here
      outside = st.gsub(/<(?:ul|dl|ol)(.+?)<\/(?:ul|dl|ol)>/im, '').gsub(/<.+?>/m, '').gsub(/\s+/, ' ')
      list = listpart.split(/<li[^>]*>/)
      list.shift # text before the first <li>
      rate = evaluate_list(list)
      outside.length <= st.length / (45 / rate)
    end

    # estimates how much degree of link list
    # Returns 1 for an empty list, otherwise a value from 1.0 (no item is a
    # link) to 10.0 (every item is a link).
    def evaluate_list(list)
      return 1 if list.length == 0
      hit = list.grep(/<a\s+href=(['"]?)([^"'\s]+)\1/im).length
      9 * (1.0 * hit / list.length) ** 2 + 1
    end

    # Strips tags from html.
    # Also converts fullwidth characters to ascii, removes box drawing
    # characters, decodes character references and collapses whitespace.
    def strip_tags(html)
      st = html.gsub(/<.+?>/m, '')
      # Convert from wide character to ascii (symbols, 0-9, A-Z, a-z)
      st.gsub!(WIDE_ASCII) { |wide| [wide.unpack('U').first - WIDE_ASCII_OFFSET].pack('U') }
      st.gsub!(BOX_DRAWING, '') # keisen
      st.gsub!(WIDE_SPACE, ' ')
      # Each reference is decoded exactly once, so "&amp;lt;" yields the text
      # "&lt;" and not "<".
      st.gsub!(ENTITY_REF) { |ref| CHARREF[ref] || CGI.unescapeHTML(ref) }
      # collapse runs of blanks, then blank lines and leading whitespace
      st.gsub(/[ \t]+/, " ").gsub(/\n\s*/, "\n")
    end
  end

  # NOTE(review): Extractor.new is a Module object without instance methods
  # of its own (analyse etc. are methods *of* that object), so this include
  # does not appear to add anything callable to ExtractContent. Kept as is
  # for compatibility.
  include Extractor.new
end
data/log/debug.log ADDED
File without changes
data/script/destroy ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
3
+
4
+ begin
5
+ require 'rubigen'
6
+ rescue LoadError
7
+ require 'rubygems'
8
+ require 'rubigen'
9
+ end
10
+ require 'rubigen/scripts/destroy'
11
+
12
+ ARGV.shift if ['--help', '-h'].include?(ARGV[0])
13
+ RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
14
+ RubiGen::Scripts::Destroy.new.run(ARGV)
data/script/generate ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ APP_ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
3
+
4
+ begin
5
+ require 'rubigen'
6
+ rescue LoadError
7
+ require 'rubygems'
8
+ require 'rubigen'
9
+ end
10
+ require 'rubigen/scripts/generate'
11
+
12
+ ARGV.shift if ['--help', '-h'].include?(ARGV[0])
13
+ RubiGen::Base.use_component_sources! [:rubygems, :newgem, :newgem_theme, :test_unit]
14
+ RubiGen::Scripts::Generate.new.run(ARGV)