forum_post 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/COPYING +1 -0
- data/LICENSE +1 -0
- data/README +5 -0
- data/Rakefile +62 -0
- data/lib/forum_post/init.rb +6 -0
- data/lib/forum_post.rb +174 -0
- data/resources/defaults.yaml +2 -0
- metadata +86 -0
data/COPYING
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
No copying restrictions/license given.
|
data/LICENSE
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
No license given.
|
data/README
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
# Rakefile for the forum_post gem: packages the gem, runs the tests,
# generates RDoc and provides install/uninstall convenience tasks.
require 'rake'
require 'rake/testtask'
require 'rake/clean'
require 'fileutils'

# rake/gempackagetask and rake/rdoctask were deprecated in Rake 0.9 and are
# obsolete in later releases. Prefer their replacements and only fall back
# to the legacy files when the replacements cannot be loaded.
begin
  require 'rubygems/package_task'
  package_task_class = Gem::PackageTask
rescue LoadError
  require 'rake/gempackagetask'
  package_task_class = Rake::GemPackageTask
end

begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

include FileUtils

version = "0.1"
name = "forum_post"

spec = Gem::Specification.new do |s|
  s.name = name
  s.version = version
  s.description = s.summary = "Get the forum post"
  s.author = "ltl3884"
  s.email = "ltl3884@gmail.com"
  s.add_dependency('nokogiri', '>= 1.4')

  s.platform = Gem::Platform::RUBY
  # has_rdoc= is deprecated in RubyGems; only call it where it still exists.
  s.has_rdoc = true if s.respond_to?(:has_rdoc=)
  s.extra_rdoc_files = ["README"]

  s.files = %w(COPYING LICENSE README Rakefile) +
    Dir.glob("{bin,doc/rdoc,test,lib}/**/*") +
    Dir.glob("ext/**/*.{h,c,rb}") +
    Dir.glob("examples/**/*.rb") +
    Dir.glob("tools/*.rb") +
    Dir.glob("resources/**/*")

  s.require_path = "lib"
  s.bindir = "bin"
end

package_task_class.new(spec) do |p|
  # Skip the tarball on Windows (mswin) builds of Ruby.
  p.need_tar = true if RUBY_PLATFORM !~ /mswin/
end

# Build the gem and install it system-wide (uses sudo).
task :install => [:test, :package] do
  sh %{sudo gem install pkg/#{name}-#{version}.gem}
end

# Remove build artefacts, then uninstall the gem (uses sudo).
task :uninstall => [:clean] do
  sh %{sudo gem uninstall #{name}}
end

Rake::TestTask.new do |t|
  t.libs << "test"
  t.test_files = FileList['test/test*.rb']
  t.verbose = true
end

rdoc_task_class.new do |rdoc|
  rdoc.rdoc_dir = 'doc/rdoc'
  rdoc.options << '--line-numbers'
  rdoc.rdoc_files.add ['README', 'LICENSE', 'COPYING', 'lib/**/*.rb', 'doc/**/*.rdoc']
end

task :default => [:test, :package]

CLEAN.include ['build/*', '**/*.o', '**/*.so', '**/*.a', 'lib/*-*', '**/*.log', 'pkg', 'lib/*.bundle', '*.gem', '.config']
|
62
|
+
|
data/lib/forum_post.rb
ADDED
@@ -0,0 +1,174 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'nokogiri'
|
3
|
+
# Heuristic extraction of the main post text from a forum thread page.
module ForumPost
  # Wraps one HTML page and picks the element most likely to hold the post.
  #
  # Usage:
  #   ForumPost::Document.new(html, :debug => true).content
  #
  # Options (merged over DEFAULT_OPTIONS):
  #   :min_length      - minimum number of elements sharing the same tag and
  #                      class for the group to count as repeated post
  #                      containers
  #   :min_text_length - elements whose collapsed text is shorter than this
  #                      are penalised when scoring
  #   :debug           - when truthy, progress messages are printed via puts
  class Document
    DEFAULT_OPTIONS = {
      :min_length => 5,
      :min_text_length => 15
    }.freeze

    REGEXES = {
      :unlikelyCandidatesRe => /combx|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
      :maybe_post => /article|body|column|main|content|post|topic|text|info|message/i,
      :not_post => /author|head|avatar|profile|rank|user|uid/i,
      :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
      :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
      :replaceFontsRe => /<(\/?)font[^>]*>/i
    }.freeze

    # CSS selector for the elements considered as possible post containers.
    CANDIDATE_SELECTOR = "div,tr,td".freeze

    attr_accessor :options, :html

    # input   - HTML source as a String. Runs of two or more <br> become
    #           paragraph breaks and <font> tags become <span> before parsing.
    # options - Hash overriding entries of DEFAULT_OPTIONS.
    def initialize(input, options = {})
      @input = input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
      @options = DEFAULT_OPTIONS.merge(options)
      make_html
    end

    # (Re)parses the preprocessed input into @html as UTF-8.
    def make_html
      @html = Nokogiri::HTML(@input, nil, 'UTF-8')
    end

    # Returns the text of the best-scoring element, or "" when the page has
    # no candidate element at all. Mutates @html (removes and renames nodes
    # and writes "score"/"text_rate" attributes).
    def content
      remove_script_and_style!        # drop <script> and <style>
      remove_unlikely_candidates!     # drop elements that cannot be post content
      transform_misused_divs_into_p!  # turn text-only "noise" divs into <p>

      # Evaluate the grouping once; the original computed it twice.
      posts = likely_posts
      # Enough repeated containers: score inside the most likely one.
      # Otherwise (or if none survives the containment filter) score the
      # whole document instead of calling #css on nil.
      root = posts.empty? ? nil : most_likely_posts(posts)
      root ||= @html

      best_elem = select_best(score_elem(root))
      best_elem ? best_elem.text : ""
    end

    # Scores every div/tr/td under +most_likely_post+ and stores the result
    # in each element's "score" attribute. Base 100, +10 when class/id looks
    # like post content, -20 when it looks like author/profile chrome, -8
    # when the text is shorter than the :min_text_length option.
    # Returns the scored collection.
    def score_elem(most_likely_post)
      # Honour the instance option; the original always read DEFAULT_OPTIONS.
      min_text = options[:min_text_length] || DEFAULT_OPTIONS[:min_text_length]
      candidates = most_likely_post.css(CANDIDATE_SELECTOR)
      candidates.each do |elem|
        label = "#{elem[:class]}#{elem[:id]}"
        score = 100
        score += 10 if label =~ REGEXES[:maybe_post]
        score -= 20 if label =~ REGEXES[:not_post]
        score -= 8 if elem_size(elem) < min_text
        elem["score"] = score.to_s
      end
      candidates
    end

    # Picks one element out of the scored +candidates+:
    # 1. keep only those with the highest "score";
    # 2. among them keep those with no descendant resembling another top
    #    candidate (see #is_contain);
    # 3. break remaining ties by the ratio of text size to markup size.
    # Returns nil when +candidates+ is empty.
    def select_best(candidates)
      ranked = candidates.sort { |a, b| b["score"].to_i <=> a["score"].to_i }
      return nil if ranked.empty?

      best_score = ranked.first["score"]
      top = ranked.select { |c| c["score"] == best_score }
      return top.first if top.size == 1

      innermost = top.reject do |outer|
        top.any? { |other| outer != other && is_contain(outer, other) }
      end
      return innermost.first if innermost.size == 1
      # Every top candidate appears to contain another one; fall back to a
      # top candidate rather than returning nil.
      return top.first if innermost.empty?

      innermost.each do |lc|
        lc["text_rate"] = (elem_size(lc) / elem_size(lc, 'inner_html').to_f).to_s
      end
      innermost.sort { |a, b| b["text_rate"].to_f <=> a["text_rate"].to_f }.first
    end

    # From the group representatives in +likely_posts+, drops every element
    # that some other representative appears to contain, then returns the
    # one with the most text (nil when nothing is left).
    def most_likely_posts(likely_posts)
      survivors = likely_posts.reject do |q|
        likely_posts.any? { |p| p != q && is_contain(p, q) }
      end
      survivors.each { |q| debug("most_likelys:#{q.name}.#{q[:class]}.#{q[:id]}") }
      survivors.max_by { |elem| elem_size(elem) }
    end

    # Length of the element's text ('inner_text', default) or markup
    # ('inner_html') after deleting every run of two or more whitespace
    # characters. Returns nil for any other +type+.
    def elem_size(elem, type = 'inner_text')
      raw = case type
            when 'inner_text' then elem.text
            when 'inner_html' then elem.inner_html
            end
      raw.gsub(/\s{2,}/, "").size if raw
    end

    # Groups div/tr/td elements whose class+id matches :maybe_post by
    # "tag,class" and returns the first element of every group that has at
    # least :min_length members.
    def likely_posts
      # Honour the instance option; the original always read DEFAULT_OPTIONS.
      min_count = options[:min_length] || DEFAULT_OPTIONS[:min_length]
      groups = {}
      @html.css(CANDIDATE_SELECTOR).each do |elem|
        next unless "#{elem[:class]}#{elem[:id]}" =~ REGEXES[:maybe_post]
        key = "#{elem.name},#{elem[:class]}"
        (groups[key] ||= []) << elem
      end
      groups.values.select { |members| members.size >= min_count }.map { |members| members.first }
    end

    # Removes all <script> and <style> elements from the document.
    def remove_script_and_style!
      @html.css("script, style").each { |node| node.remove }
    end

    # Removes elements whose class+id matches :unlikelyCandidatesRe, unless
    # it also matches :maybe_post or the element is <body>.
    def remove_unlikely_candidates!
      @html.css("*").each do |elem|
        label = "#{elem[:class]}#{elem[:id]}"
        next unless label =~ REGEXES[:unlikelyCandidatesRe]
        next if label =~ REGEXES[:maybe_post]
        next if elem.name.downcase == 'body'
        debug("Removing unlikely candidate - #{label}")
        elem.remove
      end
    end

    # Renames to <p> every <div> whose inner HTML contains none of the tags
    # listed in :divToPElementsRe.
    def transform_misused_divs_into_p!
      @html.css("*").each do |elem|
        next unless elem.name.downcase == "div"
        next if elem.inner_html =~ REGEXES[:divToPElementsRe]
        debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
        elem.name = "p"
      end
    end

    # True when +p+ has a div/tr/td descendant with the same tag name and
    # class attribute as +q+. This is a look-alike test, not a check that
    # +q+ itself is a descendant of +p+.
    def is_contain(p, q)
      p.css(CANDIDATE_SELECTOR).any? do |item|
        item.name == q.name && item[:class] == q[:class]
      end
    end

    # Prints +str+ when the :debug option is set.
    def debug(str)
      puts str if options[:debug]
    end

  end
end
|
161
|
+
|
162
|
+
|
163
|
+
#require 'open-uri'
|
164
|
+
#include ForumPost
|
165
|
+
#uri = 'http://topic.csdn.net/u/20110718/17/f1a523fb-8c65-4510-a094-daf7bd2698cf.html?50656'
|
166
|
+
#uri = 'http://bbs.lampchina.net/thread-29528-1-1.html'
|
167
|
+
#uri = 'http://bbs.55bbs.com/thread-5662509-1-1.html'
|
168
|
+
|
169
|
+
#uri='http://topic.csdn.net/u/20110803/11/7ef08712-129d-4971-88a4-4bd0b712c804.html?90401'
|
170
|
+
#f = open(uri).read
|
171
|
+
#charset = adjudge_html_charset(f)
|
172
|
+
#source = convert_to_utf8(f,charset)
|
173
|
+
#d = Document.new(source)
|
174
|
+
#puts d.content
|
metadata
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: forum_post
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 9
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: "0.1"
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- ltl3884
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2011-08-04 00:00:00 +08:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: nokogiri
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
hash: 7
|
29
|
+
segments:
|
30
|
+
- 1
|
31
|
+
- 4
|
32
|
+
version: "1.4"
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
35
|
+
description: Get the forum post
|
36
|
+
email: ltl3884@gmail.com
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files:
|
42
|
+
- README
|
43
|
+
files:
|
44
|
+
- COPYING
|
45
|
+
- LICENSE
|
46
|
+
- README
|
47
|
+
- Rakefile
|
48
|
+
- lib/forum_post.rb
|
49
|
+
- lib/forum_post/init.rb
|
50
|
+
- resources/defaults.yaml
|
51
|
+
has_rdoc: true
|
52
|
+
homepage:
|
53
|
+
licenses: []
|
54
|
+
|
55
|
+
post_install_message:
|
56
|
+
rdoc_options: []
|
57
|
+
|
58
|
+
require_paths:
|
59
|
+
- lib
|
60
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
hash: 3
|
66
|
+
segments:
|
67
|
+
- 0
|
68
|
+
version: "0"
|
69
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
70
|
+
none: false
|
71
|
+
requirements:
|
72
|
+
- - ">="
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
hash: 3
|
75
|
+
segments:
|
76
|
+
- 0
|
77
|
+
version: "0"
|
78
|
+
requirements: []
|
79
|
+
|
80
|
+
rubyforge_project:
|
81
|
+
rubygems_version: 1.3.7
|
82
|
+
signing_key:
|
83
|
+
specification_version: 3
|
84
|
+
summary: Get the forum post
|
85
|
+
test_files: []
|
86
|
+
|