forum_post 0.1 → 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +12 -2
- data/Rakefile +1 -1
- data/lib/forum_post.rb +23 -20
- data/lib/{forum_post → forum_post_v0.2}/init.rb +1 -1
- metadata +4 -4
data/README
CHANGED
@@ -1,5 +1,15 @@
|
|
1
|
-
==
|
1
|
+
== Forum_post_v0.2/ GemPlugin
|
2
2
|
|
3
|
-
|
3
|
+
the gem used to get the forum post
|
4
4
|
|
5
|
+
install:
|
6
|
+
gem install forum_post
|
7
|
+
example:
|
8
|
+
require 'rubygems'
|
9
|
+
require 'forum_post'
|
10
|
+
require 'open-uri'
|
11
|
+
uri='http://topic.csdn.net/u/20110718/17/f1a523fb-8c65-4510-a094-daf7bd2698cf.html?50656'
|
12
|
+
source=open(uri).read
|
13
|
+
doc=ForumPost::Document.new(source)
|
14
|
+
puts doc.content
|
5
15
|
|
data/Rakefile
CHANGED
data/lib/forum_post.rb
CHANGED
@@ -9,7 +9,7 @@ module ForumPost
|
|
9
9
|
|
10
10
|
REGEXES = {
|
11
11
|
:unlikelyCandidatesRe => /combx|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
|
12
|
-
:maybe_post => /article|body|column|main|content|post|topic|text|info|message/i,
|
12
|
+
:maybe_post => /article|body|column|main|content|post|topic|text|info|message|item/i,
|
13
13
|
:not_post => /author|head|avatar|profile|rank|user|uid/i,
|
14
14
|
:divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
|
15
15
|
:replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
|
@@ -32,21 +32,23 @@ module ForumPost
|
|
32
32
|
remove_script_and_style!#去除scrpt和style
|
33
33
|
remove_unlikely_candidates!#去除不可能是发帖内容的标签
|
34
34
|
transform_misused_divs_into_p!#把一些噪音的div转换成p标签
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
35
|
+
bests=[]
|
36
|
+
better_post = likely_posts
|
37
|
+
if better_post.size > 0
|
38
|
+
debug("post>4")
|
39
|
+
@html = most_likely_posts(better_post)
|
40
|
+
else
|
41
|
+
debug("post<4")
|
42
|
+
end
|
43
|
+
bests=@html.css("div,tr,td")
|
44
|
+
return @html.text.gsub(/\s(\s+)/,"") if bests.size==0
|
45
|
+
candidates=score_elem(bests)
|
46
|
+
best_elem=select_best(candidates)
|
47
|
+
best_elem.text
|
45
48
|
end
|
46
49
|
|
47
|
-
def score_elem(
|
48
|
-
|
49
|
-
candidates.each do |elem|
|
50
|
+
def score_elem(bests)
|
51
|
+
bests.each do |elem|
|
50
52
|
base_score=100
|
51
53
|
str = "#{elem[:class]}#{elem[:id]}"
|
52
54
|
base_score+=10 if str =~ REGEXES[:maybe_post]
|
@@ -54,7 +56,7 @@ module ForumPost
|
|
54
56
|
base_score-=8 if elem_size(elem)<DEFAULT_OPTIONS[:min_text_length]
|
55
57
|
elem["score"]=base_score.to_s
|
56
58
|
end
|
57
|
-
|
59
|
+
bests
|
58
60
|
end
|
59
61
|
|
60
62
|
def select_best(candidates)
|
@@ -79,11 +81,11 @@ module ForumPost
|
|
79
81
|
last_candidates.sort{|a,b| b["text_rate"].to_f<=>a["text_rate"].to_f}.first
|
80
82
|
end
|
81
83
|
|
82
|
-
def most_likely_posts(
|
84
|
+
def most_likely_posts(better_post)
|
83
85
|
most_likely_posts=[]
|
84
|
-
|
86
|
+
better_post.each do |q|
|
85
87
|
flag=0
|
86
|
-
|
88
|
+
better_post.each do |p|
|
87
89
|
if is_contain(p,q) == true && p != q
|
88
90
|
flag+=1
|
89
91
|
end
|
@@ -93,6 +95,7 @@ module ForumPost
|
|
93
95
|
debug("most_likelys:#{q.name}.#{q[:class]}.#{q[:id]}")
|
94
96
|
end
|
95
97
|
end
|
98
|
+
debug("most_likely_posts:#{most_likely_posts.size}")
|
96
99
|
most_likely_posts.sort{|m,n| elem_size(n)<=>elem_size(m)}.first
|
97
100
|
end
|
98
101
|
|
@@ -128,7 +131,7 @@ module ForumPost
|
|
128
131
|
@html.css("*").each do |elem|
|
129
132
|
str = "#{elem[:class]}#{elem[:id]}"
|
130
133
|
if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:maybe_post] && elem.name.downcase != 'body'
|
131
|
-
|
134
|
+
debug("Removing unlikely candidate - #{str}")
|
132
135
|
elem.remove
|
133
136
|
end
|
134
137
|
end
|
@@ -138,7 +141,7 @@ module ForumPost
|
|
138
141
|
@html.css("*").each do |elem|
|
139
142
|
if elem.name.downcase == "div"
|
140
143
|
if elem.inner_html !~ REGEXES[:divToPElementsRe]
|
141
|
-
|
144
|
+
#debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
|
142
145
|
elem.name = "p"
|
143
146
|
end
|
144
147
|
end
|
metadata
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: forum_post
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 1
|
9
|
-
version: "0.1"
|
8
|
+
- 2
|
9
|
+
version: "0.2"
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- ltl3884
|
@@ -46,7 +46,7 @@ files:
|
|
46
46
|
- README
|
47
47
|
- Rakefile
|
48
48
|
- lib/forum_post.rb
|
49
|
-
- lib/forum_post/init.rb
|
49
|
+
- lib/forum_post_v0.2/init.rb
|
50
50
|
- resources/defaults.yaml
|
51
51
|
has_rdoc: true
|
52
52
|
homepage:
|