forum_post 0.2 → 0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +1 -1
- data/lib/forum_post.rb +28 -13
- metadata +4 -4
data/Rakefile
CHANGED
data/lib/forum_post.rb
CHANGED
@@ -9,7 +9,7 @@ module ForumPost
|
|
9
9
|
|
10
10
|
REGEXES = {
|
11
11
|
:unlikelyCandidatesRe => /combx|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
|
12
|
-
:maybe_post => /article|body|column|main|content|post|topic|text|info|message|item/i,
|
12
|
+
:maybe_post => /article|body|column|main|content|post|topic|text|info|message|item|bord|forum/i,
|
13
13
|
:not_post => /author|head|avatar|profile|rank|user|uid/i,
|
14
14
|
:divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
|
15
15
|
:replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
|
@@ -32,19 +32,24 @@ module ForumPost
|
|
32
32
|
remove_script_and_style!#去除scrpt和style
|
33
33
|
remove_unlikely_candidates!#去除不可能是发帖内容的标签
|
34
34
|
transform_misused_divs_into_p!#把一些噪音的div转换成p标签
|
35
|
-
bests=[]
|
36
35
|
better_post = likely_posts
|
37
36
|
if better_post.size > 0
|
38
37
|
debug("post>4")
|
39
|
-
|
38
|
+
handle_html=most_likely_posts(better_post)
|
40
39
|
else
|
40
|
+
handle_html=@html
|
41
41
|
debug("post<4")
|
42
42
|
end
|
43
|
-
bests
|
44
|
-
|
43
|
+
bests=handle_html.css("div,tr,td")
|
44
|
+
if bests.size==0
|
45
|
+
debug("best_one:#{bests.name}.#{bests[:class]} #{bests[:id]}")
|
46
|
+
return handle_html.text.gsub(/\s(\s+)/,"")
|
47
|
+
else
|
48
|
+
bests.map{|best| debug("bests:#{best.name}.#{best[:class]} #{best[:id]}")}
|
49
|
+
end
|
45
50
|
candidates=score_elem(bests)
|
46
51
|
best_elem=select_best(candidates)
|
47
|
-
best_elem.text
|
52
|
+
best_elem.text.gsub(/\s(\s+)/,"")
|
48
53
|
end
|
49
54
|
|
50
55
|
def score_elem(bests)
|
@@ -56,6 +61,7 @@ module ForumPost
|
|
56
61
|
base_score-=8 if elem_size(elem)<DEFAULT_OPTIONS[:min_text_length]
|
57
62
|
elem["score"]=base_score.to_s
|
58
63
|
end
|
64
|
+
bests.map{|best| debug("#{best.name}.#{best[:class]} #{best[:id]}---score:#{best['score']}")}
|
59
65
|
bests
|
60
66
|
end
|
61
67
|
|
@@ -74,10 +80,14 @@ module ForumPost
|
|
74
80
|
end
|
75
81
|
last_candidates<<p if flag==0
|
76
82
|
end
|
77
|
-
|
83
|
+
if last_candidates.size==1
|
84
|
+
debug("best_one:#{last_candidates.first.name}.#{last_candidates.first[:class]} #{last_candidates.first[:id]}")
|
85
|
+
return last_candidates.first
|
86
|
+
end
|
78
87
|
last_candidates.each do |lc|
|
79
88
|
lc["text_rate"] = (elem_size(lc)/elem_size(lc,'inner_html').to_f).to_s
|
80
89
|
end
|
90
|
+
last_candidates.map{|lc| debug("best_one:#{lc.name}.#{lc[:class]} #{lc[:id]}---text_rate:#{lc['text_rate']}")}
|
81
91
|
last_candidates.sort{|a,b| b["text_rate"].to_f<=>a["text_rate"].to_f}.first
|
82
92
|
end
|
83
93
|
|
@@ -95,7 +105,6 @@ module ForumPost
|
|
95
105
|
debug("most_likelys:#{q.name}.#{q[:class]}.#{q[:id]}")
|
96
106
|
end
|
97
107
|
end
|
98
|
-
debug("most_likely_posts:#{most_likely_posts.size}")
|
99
108
|
most_likely_posts.sort{|m,n| elem_size(n)<=>elem_size(m)}.first
|
100
109
|
end
|
101
110
|
|
@@ -111,18 +120,24 @@ module ForumPost
|
|
111
120
|
str = "#{elem[:class]}#{elem[:id]}"
|
112
121
|
if str =~ REGEXES[:maybe_post]
|
113
122
|
flag="#{elem.name},#{elem[:class]}"
|
114
|
-
|
115
|
-
h[flag] << elem
|
116
|
-
else
|
117
|
-
h[flag] = ([] << elem)
|
118
|
-
end
|
123
|
+
collect_likely_elem(h,flag,elem)
|
119
124
|
end
|
120
125
|
end
|
121
126
|
h.delete_if{|k,v| v.size < DEFAULT_OPTIONS[:min_length]}
|
122
127
|
h.map{|k,v| likely_posts << v.first}
|
128
|
+
likely_posts.map{|lp| debug("likely_posts:#{lp.name}.#{lp[:class]} #{lp[:id]}")}
|
123
129
|
likely_posts
|
124
130
|
end
|
125
131
|
|
132
|
+
def collect_likely_elem(h,flag,elem)
|
133
|
+
flag.split(/ /).each do |item|
|
134
|
+
h.map do |k,v|
|
135
|
+
h[k] << elem if k =~ Regexp.new(item)
|
136
|
+
end
|
137
|
+
end
|
138
|
+
h[flag] = ([] << elem)
|
139
|
+
end
|
140
|
+
|
126
141
|
def remove_script_and_style!
|
127
142
|
@html.css("script, style").each { |i| i.remove }
|
128
143
|
end
|
metadata
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: forum_post
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 13
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 2
|
9
|
-
version: "0.2"
|
8
|
+
- 3
|
9
|
+
version: "0.3"
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- ltl3884
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-08-
|
17
|
+
date: 2011-08-09 00:00:00 +08:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|