forum_post 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/COPYING +1 -0
- data/LICENSE +1 -0
- data/README +5 -0
- data/Rakefile +62 -0
- data/lib/forum_post/init.rb +6 -0
- data/lib/forum_post.rb +174 -0
- data/resources/defaults.yaml +2 -0
- metadata +86 -0
data/COPYING
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
No copying restrictions/license given.
|
data/LICENSE
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
No license given.
|
data/README
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
# Build, test, documentation and packaging tasks for the forum_post gem.
require 'rake'
require 'rake/testtask'
require 'rake/clean'
# Rake::GemPackageTask and Rake::RDocTask are no longer shipped with Rake;
# their maintained equivalents live in RubyGems and RDoc respectively.
require 'rubygems/package_task'
require 'rdoc/task'
require 'fileutils'
include FileUtils

version = "0.1"
name = "forum_post"

# Gem specification shared by the packaging and install tasks below.
spec = Gem::Specification.new do |s|
  s.name = name
  s.version = version
  s.description = s.summary = "Get the forum post"
  s.author = "ltl3884"
  s.email = "ltl3884@gmail.com"
  s.add_dependency('nokogiri', '>= 1.4')

  s.platform = Gem::Platform::RUBY
  # NOTE: the former `s.has_rdoc = true` was dropped; RubyGems deprecated that
  # setter with no replacement, so it only produced a warning.
  s.extra_rdoc_files = ["README"]

  s.files = %w(COPYING LICENSE README Rakefile) +
            Dir.glob("{bin,doc/rdoc,test,lib}/**/*") +
            Dir.glob("ext/**/*.{h,c,rb}") +
            Dir.glob("examples/**/*.rb") +
            Dir.glob("tools/*.rb") +
            Dir.glob("resources/**/*")

  s.require_path = "lib"
  s.bindir = "bin"
end

# Defines the :package task (and friends) that build pkg/<name>-<version>.gem.
Gem::PackageTask.new(spec) do |pkg|
  pkg.need_tar = true if RUBY_PLATFORM !~ /mswin/
end

# Runs the tests, builds the gem and installs it system-wide.
task :install => [:test, :package] do
  sh %{sudo gem install pkg/#{name}-#{version}.gem}
end

# Cleans build artefacts and removes the installed gem.
task :uninstall => [:clean] do
  sh %{sudo gem uninstall #{name}}
end

Rake::TestTask.new do |t|
  t.libs << "test"
  t.test_files = FileList['test/test*.rb']
  t.verbose = true
end

RDoc::Task.new do |rdoc|
  rdoc.rdoc_dir = 'doc/rdoc'
  rdoc.options << '--line-numbers'
  rdoc.rdoc_files.add ['README', 'LICENSE', 'COPYING', 'lib/**/*.rb', 'doc/**/*.rdoc']
end

task :default => [:test, :package]

# Files and directories removed by `rake clean`.
CLEAN.include ['build/*', '**/*.o', '**/*.so', '**/*.a', 'lib/*-*', '**/*.log', 'pkg', 'lib/*.bundle', '*.gem', '.config']
|
62
|
+
|
data/lib/forum_post.rb
ADDED
@@ -0,0 +1,174 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'nokogiri'
|
3
|
+
module ForumPost
  # Heuristic extractor for the main post text of a forum thread page.
  #
  # The raw HTML is lightly normalised and parsed with Nokogiri. #content then
  # prunes markup that is unlikely to be post content, looks for a tag/class
  # combination that repeats often enough to look like a list of posts, scores
  # the div/tr/td elements inside the most promising one and returns the text
  # of the best-scoring element.
  class Document
    # Defaults merged underneath the options passed to #initialize.
    #
    # :min_length      - minimum number of elements that must share one
    #                    "tag,class" signature for that group to be treated as
    #                    a list of posts (see #likely_posts).
    # :min_text_length - elements whose text is shorter than this are
    #                    penalised when scored (see #score_elem).
    DEFAULT_OPTIONS = {
      :min_length => 5,
      :min_text_length => 15
    }.freeze

    # Patterns matched against an element's concatenated class and id, or
    # against raw markup.
    REGEXES = {
      :unlikelyCandidatesRe => /combx|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
      :maybe_post => /article|body|column|main|content|post|topic|text|info|message/i,
      :not_post => /author|head|avatar|profile|rank|user|uid/i,
      :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
      :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
      :replaceFontsRe => /<(\/?)font[^>]*>/i
    }.freeze

    attr_accessor :options, :html

    # input   - HTML source as a String.
    # options - Hash merged over DEFAULT_OPTIONS; :debug => true makes the
    #           extractor print its decisions with puts.
    def initialize(input, options = {})
      # Runs of two or more <br> become paragraph breaks and <font> tags become
      # <span> before the markup is parsed.
      @input = input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
      @options = DEFAULT_OPTIONS.merge(options)
      make_html
    end

    # (Re)parses the normalised input into @html as UTF-8 and returns it.
    def make_html
      @html = Nokogiri::HTML(@input, nil, 'UTF-8')
    end

    # Returns the text of the element judged most likely to be the post body,
    # or an empty String when the page offers no candidate at all.
    #
    # Note that this mutates the parsed document (nodes are removed or renamed
    # and "score"/"text_rate" attributes are written onto elements).
    def content
      remove_script_and_style!        # drop <script> and <style>
      remove_unlikely_candidates!     # drop tags that cannot be post content
      transform_misused_divs_into_p!  # turn "noise" divs into <p> tags

      posts = likely_posts
      # With enough repeated post-like containers, scoring is restricted to the
      # most likely one; otherwise the whole document is scored.
      post_root = posts.empty? ? nil : most_likely_posts(posts)
      # When the chosen container has no div/tr/td descendants there is nothing
      # to score, so the container itself is the best available answer.
      best_elem = select_best(score_elem(post_root || @html)) || post_root
      best_elem ? best_elem.text : ''
    end

    # Scores every div/tr/td below +most_likely_post+ and stores the result as
    # a string in each element's "score" attribute. Returns the scored set.
    #
    # Every element starts at 100, gains 10 when its class/id looks like post
    # content, loses 20 when it looks like author/profile chrome and loses 8
    # when its text is shorter than the :min_text_length option.
    def score_elem(most_likely_post)
      candidates = most_likely_post.css("div,tr,td")
      candidates.each do |elem|
        signature = "#{elem[:class]}#{elem[:id]}"
        score = 100
        score += 10 if signature =~ REGEXES[:maybe_post]
        score -= 20 if signature =~ REGEXES[:not_post]
        score -= 8 if elem_size(elem) < option(:min_text_length)
        elem["score"] = score.to_s
      end
      candidates
    end

    # Picks one element out of +candidates+ (elements already carrying a
    # "score" attribute). Returns nil when +candidates+ is empty.
    #
    # Only the top-scoring elements are considered. Among those, elements that
    # contain a look-alike (same tag and class) of another finalist are
    # dropped; if that would drop all of them the finalists are kept as they
    # are. Remaining ties are broken by the highest text-to-markup ratio, which
    # is also written to each element's "text_rate" attribute.
    def select_best(candidates)
      pool = candidates.to_a
      return nil if pool.empty?

      best_score = pool.map { |c| c["score"].to_i }.max
      finalists = pool.select { |c| c["score"].to_i == best_score }
      return finalists.first if finalists.size == 1

      innermost = finalists.reject do |p|
        finalists.any? { |q| p != q && is_contain(p, q) }
      end
      innermost = finalists if innermost.empty?
      return innermost.first if innermost.size == 1

      innermost.each do |lc|
        html_size = elem_size(lc, 'inner_html')
        # Guard the division so an element without markup rates 0.0, not NaN.
        rate = html_size.zero? ? 0.0 : elem_size(lc) / html_size.to_f
        lc["text_rate"] = rate.to_s
      end
      innermost.max_by { |lc| lc["text_rate"].to_f }
    end

    # From +likely_posts+ keeps the elements of which no other entry contains
    # a look-alike (same tag and class), and returns the one with the most
    # text. Returns nil when nothing is left.
    def most_likely_posts(likely_posts)
      outermost = likely_posts.reject do |q|
        likely_posts.any? { |p| p != q && is_contain(p, q) }
      end
      outermost.each { |q| debug("most_likelys:#{q.name}.#{q[:class]}.#{q[:id]}") }
      outermost.max_by { |elem| elem_size(elem) }
    end

    # Length of the element's text (type 'inner_text', the default) or of its
    # inner HTML (type 'inner_html') after deleting every run of two or more
    # whitespace characters. Returns nil for any other +type+.
    def elem_size(elem, type = 'inner_text')
      raw = case type
            when 'inner_text' then elem.text
            when 'inner_html' then elem.inner_html
            end
      raw && raw.gsub(/\s(\s+)/, "").size
    end

    # Groups the document's div/tr/td elements whose class/id looks like post
    # content by their "tag,class" signature and returns the first element of
    # every group that has at least :min_length members.
    def likely_posts
      groups = {}
      @html.css("div,tr,td").each do |elem|
        next unless "#{elem[:class]}#{elem[:id]}" =~ REGEXES[:maybe_post]
        (groups["#{elem.name},#{elem[:class]}"] ||= []) << elem
      end
      repeated = groups.values.select { |members| members.size >= option(:min_length) }
      repeated.map { |members| members.first }
    end

    # Removes all <script> and <style> elements from the document.
    def remove_script_and_style!
      @html.css("script, style").each { |node| node.remove }
    end

    # Removes every element (except <body>) whose class/id matches the
    # unlikely-candidate pattern and does not also look like post content.
    def remove_unlikely_candidates!
      @html.css("*").each do |elem|
        str = "#{elem[:class]}#{elem[:id]}"
        next unless str =~ REGEXES[:unlikelyCandidatesRe]
        next if str =~ REGEXES[:maybe_post] || elem.name.downcase == 'body'
        debug("Removing unlikely candidate - #{str}")
        elem.remove
      end
    end

    # Renames to <p> every <div> whose inner HTML contains none of the tags
    # listed in REGEXES[:divToPElementsRe].
    def transform_misused_divs_into_p!
      @html.css("div").each do |elem|
        next if elem.inner_html =~ REGEXES[:divToPElementsRe]
        debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p")
        elem.name = "p"
      end
    end

    # True when some div/tr/td below +p+ has the same tag name and class
    # attribute as +q+. This is a look-alike test, not an identity test.
    def is_contain(p, q)
      p.css("div,tr,td").any? do |item|
        item.name == q.name && item[:class] == q[:class]
      end
    end

    # Prints +str+ when the :debug option is set.
    def debug(str)
      puts str if options[:debug]
    end

    private

    # Value of option +key+, falling back to DEFAULT_OPTIONS when the instance
    # options do not define it.
    def option(key)
      options.fetch(key, DEFAULT_OPTIONS[key])
    end
  end
end
|
161
|
+
|
162
|
+
|
163
|
+
#require 'open-uri'
|
164
|
+
#include ForumPost
|
165
|
+
#uri = 'http://topic.csdn.net/u/20110718/17/f1a523fb-8c65-4510-a094-daf7bd2698cf.html?50656'
|
166
|
+
#uri = 'http://bbs.lampchina.net/thread-29528-1-1.html'
|
167
|
+
#uri = 'http://bbs.55bbs.com/thread-5662509-1-1.html'
|
168
|
+
|
169
|
+
#uri='http://topic.csdn.net/u/20110803/11/7ef08712-129d-4971-88a4-4bd0b712c804.html?90401'
|
170
|
+
#f = open(uri).read
|
171
|
+
#charset = adjudge_html_charset(f)
|
172
|
+
#source = convert_to_utf8(f,charset)
|
173
|
+
#d = Document.new(source)
|
174
|
+
#puts d.content
|
metadata
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: forum_post
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 9
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: "0.1"
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- ltl3884
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2011-08-04 00:00:00 +08:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: nokogiri
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
hash: 7
|
29
|
+
segments:
|
30
|
+
- 1
|
31
|
+
- 4
|
32
|
+
version: "1.4"
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
35
|
+
description: Get the forum post
|
36
|
+
email: ltl3884@gmail.com
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files:
|
42
|
+
- README
|
43
|
+
files:
|
44
|
+
- COPYING
|
45
|
+
- LICENSE
|
46
|
+
- README
|
47
|
+
- Rakefile
|
48
|
+
- lib/forum_post.rb
|
49
|
+
- lib/forum_post/init.rb
|
50
|
+
- resources/defaults.yaml
|
51
|
+
has_rdoc: true
|
52
|
+
homepage:
|
53
|
+
licenses: []
|
54
|
+
|
55
|
+
post_install_message:
|
56
|
+
rdoc_options: []
|
57
|
+
|
58
|
+
require_paths:
|
59
|
+
- lib
|
60
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
hash: 3
|
66
|
+
segments:
|
67
|
+
- 0
|
68
|
+
version: "0"
|
69
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
70
|
+
none: false
|
71
|
+
requirements:
|
72
|
+
- - ">="
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
hash: 3
|
75
|
+
segments:
|
76
|
+
- 0
|
77
|
+
version: "0"
|
78
|
+
requirements: []
|
79
|
+
|
80
|
+
rubyforge_project:
|
81
|
+
rubygems_version: 1.3.7
|
82
|
+
signing_key:
|
83
|
+
specification_version: 3
|
84
|
+
summary: Get the forum post
|
85
|
+
test_files: []
|
86
|
+
|