forum_post 0.1 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README +12 -2
- data/Rakefile +1 -1
- data/lib/forum_post.rb +23 -20
- data/lib/{forum_post → forum_post_v0.2}/init.rb +1 -1
- metadata +4 -4
data/README
CHANGED
@@ -1,5 +1,15 @@
|
|
1
|
-
==
|
1
|
+
== Forum_post_v0.2/ GemPlugin
|
2
2
|
|
3
|
-
|
3
|
+
the gem used to get the forum post
|
4
4
|
|
5
|
+
install:
|
6
|
+
gem install forum_post
|
7
|
+
example:
|
8
|
+
require 'rubygems'
|
9
|
+
require 'forum_post'
|
10
|
+
require 'open-uri'
|
11
|
+
uri='http://topic.csdn.net/u/20110718/17/f1a523fb-8c65-4510-a094-daf7bd2698cf.html?50656'
|
12
|
+
source=open(uri).read
|
13
|
+
doc=ForumPost::Document.new(source)
|
14
|
+
puts doc.content
|
5
15
|
|
data/Rakefile
CHANGED
data/lib/forum_post.rb
CHANGED
@@ -9,7 +9,7 @@ module ForumPost
|
|
9
9
|
|
10
10
|
REGEXES = {
|
11
11
|
:unlikelyCandidatesRe => /combx|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
|
12
|
-
:maybe_post => /article|body|column|main|content|post|topic|text|info|message/i,
|
12
|
+
:maybe_post => /article|body|column|main|content|post|topic|text|info|message|item/i,
|
13
13
|
:not_post => /author|head|avatar|profile|rank|user|uid/i,
|
14
14
|
:divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
|
15
15
|
:replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
|
@@ -32,21 +32,23 @@ module ForumPost
|
|
32
32
|
remove_script_and_style!#去除scrpt和style
|
33
33
|
remove_unlikely_candidates!#去除不可能是发帖内容的标签
|
34
34
|
transform_misused_divs_into_p!#把一些噪音的div转换成p标签
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
35
|
+
bests=[]
|
36
|
+
better_post = likely_posts
|
37
|
+
if better_post.size > 0
|
38
|
+
debug("post>4")
|
39
|
+
@html = most_likely_posts(better_post)
|
40
|
+
else
|
41
|
+
debug("post<4")
|
42
|
+
end
|
43
|
+
bests=@html.css("div,tr,td")
|
44
|
+
return @html.text.gsub(/\s(\s+)/,"") if bests.size==0
|
45
|
+
candidates=score_elem(bests)
|
46
|
+
best_elem=select_best(candidates)
|
47
|
+
best_elem.text
|
45
48
|
end
|
46
49
|
|
47
|
-
def score_elem(
|
48
|
-
|
49
|
-
candidates.each do |elem|
|
50
|
+
def score_elem(bests)
|
51
|
+
bests.each do |elem|
|
50
52
|
base_score=100
|
51
53
|
str = "#{elem[:class]}#{elem[:id]}"
|
52
54
|
base_score+=10 if str =~ REGEXES[:maybe_post]
|
@@ -54,7 +56,7 @@ module ForumPost
|
|
54
56
|
base_score-=8 if elem_size(elem)<DEFAULT_OPTIONS[:min_text_length]
|
55
57
|
elem["score"]=base_score.to_s
|
56
58
|
end
|
57
|
-
|
59
|
+
bests
|
58
60
|
end
|
59
61
|
|
60
62
|
def select_best(candidates)
|
@@ -79,11 +81,11 @@ module ForumPost
|
|
79
81
|
last_candidates.sort{|a,b| b["text_rate"].to_f<=>a["text_rate"].to_f}.first
|
80
82
|
end
|
81
83
|
|
82
|
-
def most_likely_posts(
|
84
|
+
def most_likely_posts(better_post)
|
83
85
|
most_likely_posts=[]
|
84
|
-
|
86
|
+
better_post.each do |q|
|
85
87
|
flag=0
|
86
|
-
|
88
|
+
better_post.each do |p|
|
87
89
|
if is_contain(p,q) == true && p != q
|
88
90
|
flag+=1
|
89
91
|
end
|
@@ -93,6 +95,7 @@ module ForumPost
|
|
93
95
|
debug("most_likelys:#{q.name}.#{q[:class]}.#{q[:id]}")
|
94
96
|
end
|
95
97
|
end
|
98
|
+
debug("most_likely_posts:#{most_likely_posts.size}")
|
96
99
|
most_likely_posts.sort{|m,n| elem_size(n)<=>elem_size(m)}.first
|
97
100
|
end
|
98
101
|
|
@@ -128,7 +131,7 @@ module ForumPost
|
|
128
131
|
@html.css("*").each do |elem|
|
129
132
|
str = "#{elem[:class]}#{elem[:id]}"
|
130
133
|
if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:maybe_post] && elem.name.downcase != 'body'
|
131
|
-
|
134
|
+
debug("Removing unlikely candidate - #{str}")
|
132
135
|
elem.remove
|
133
136
|
end
|
134
137
|
end
|
@@ -138,7 +141,7 @@ module ForumPost
|
|
138
141
|
@html.css("*").each do |elem|
|
139
142
|
if elem.name.downcase == "div"
|
140
143
|
if elem.inner_html !~ REGEXES[:divToPElementsRe]
|
141
|
-
|
144
|
+
#debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
|
142
145
|
elem.name = "p"
|
143
146
|
end
|
144
147
|
end
|
metadata
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: forum_post
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
version: "0.
|
8
|
+
- 2
|
9
|
+
version: "0.2"
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- ltl3884
|
@@ -46,7 +46,7 @@ files:
|
|
46
46
|
- README
|
47
47
|
- Rakefile
|
48
48
|
- lib/forum_post.rb
|
49
|
-
- lib/
|
49
|
+
- lib/forum_post_v0.2/init.rb
|
50
50
|
- resources/defaults.yaml
|
51
51
|
has_rdoc: true
|
52
52
|
homepage:
|