forum_post 0.2 → 0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -1
- data/lib/forum_post.rb +28 -13
- metadata +4 -4
data/Rakefile
CHANGED
data/lib/forum_post.rb
CHANGED
@@ -9,7 +9,7 @@ module ForumPost
|
|
9
9
|
|
10
10
|
REGEXES = {
|
11
11
|
:unlikelyCandidatesRe => /combx|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
|
12
|
-
:maybe_post => /article|body|column|main|content|post|topic|text|info|message|item/i,
|
12
|
+
:maybe_post => /article|body|column|main|content|post|topic|text|info|message|item|bord|forum/i,
|
13
13
|
:not_post => /author|head|avatar|profile|rank|user|uid/i,
|
14
14
|
:divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
|
15
15
|
:replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
|
@@ -32,19 +32,24 @@ module ForumPost
|
|
32
32
|
remove_script_and_style!#去除scrpt和style
|
33
33
|
remove_unlikely_candidates!#去除不可能是发帖内容的标签
|
34
34
|
transform_misused_divs_into_p!#把一些噪音的div转换成p标签
|
35
|
-
bests=[]
|
36
35
|
better_post = likely_posts
|
37
36
|
if better_post.size > 0
|
38
37
|
debug("post>4")
|
39
|
-
|
38
|
+
handle_html=most_likely_posts(better_post)
|
40
39
|
else
|
40
|
+
handle_html=@html
|
41
41
|
debug("post<4")
|
42
42
|
end
|
43
|
-
bests
|
44
|
-
|
43
|
+
bests=handle_html.css("div,tr,td")
|
44
|
+
if bests.size==0
|
45
|
+
debug("best_one:#{bests.name}.#{bests[:class]} #{bests[:id]}")
|
46
|
+
return handle_html.text.gsub(/\s(\s+)/,"")
|
47
|
+
else
|
48
|
+
bests.map{|best| debug("bests:#{best.name}.#{best[:class]} #{best[:id]}")}
|
49
|
+
end
|
45
50
|
candidates=score_elem(bests)
|
46
51
|
best_elem=select_best(candidates)
|
47
|
-
best_elem.text
|
52
|
+
best_elem.text.gsub(/\s(\s+)/,"")
|
48
53
|
end
|
49
54
|
|
50
55
|
def score_elem(bests)
|
@@ -56,6 +61,7 @@ module ForumPost
|
|
56
61
|
base_score-=8 if elem_size(elem)<DEFAULT_OPTIONS[:min_text_length]
|
57
62
|
elem["score"]=base_score.to_s
|
58
63
|
end
|
64
|
+
bests.map{|best| debug("#{best.name}.#{best[:class]} #{best[:id]}---score:#{best['score']}")}
|
59
65
|
bests
|
60
66
|
end
|
61
67
|
|
@@ -74,10 +80,14 @@ module ForumPost
|
|
74
80
|
end
|
75
81
|
last_candidates<<p if flag==0
|
76
82
|
end
|
77
|
-
|
83
|
+
if last_candidates.size==1
|
84
|
+
debug("best_one:#{last_candidates.first.name}.#{last_candidates.first[:class]} #{last_candidates.first[:id]}")
|
85
|
+
return last_candidates.first
|
86
|
+
end
|
78
87
|
last_candidates.each do |lc|
|
79
88
|
lc["text_rate"] = (elem_size(lc)/elem_size(lc,'inner_html').to_f).to_s
|
80
89
|
end
|
90
|
+
last_candidates.map{|lc| debug("best_one:#{lc.name}.#{lc[:class]} #{lc[:id]}---text_rate:#{lc['text_rate']}")}
|
81
91
|
last_candidates.sort{|a,b| b["text_rate"].to_f<=>a["text_rate"].to_f}.first
|
82
92
|
end
|
83
93
|
|
@@ -95,7 +105,6 @@ module ForumPost
|
|
95
105
|
debug("most_likelys:#{q.name}.#{q[:class]}.#{q[:id]}")
|
96
106
|
end
|
97
107
|
end
|
98
|
-
debug("most_likely_posts:#{most_likely_posts.size}")
|
99
108
|
most_likely_posts.sort{|m,n| elem_size(n)<=>elem_size(m)}.first
|
100
109
|
end
|
101
110
|
|
@@ -111,18 +120,24 @@ module ForumPost
|
|
111
120
|
str = "#{elem[:class]}#{elem[:id]}"
|
112
121
|
if str =~ REGEXES[:maybe_post]
|
113
122
|
flag="#{elem.name},#{elem[:class]}"
|
114
|
-
|
115
|
-
h[flag] << elem
|
116
|
-
else
|
117
|
-
h[flag] = ([] << elem)
|
118
|
-
end
|
123
|
+
collect_likely_elem(h,flag,elem)
|
119
124
|
end
|
120
125
|
end
|
121
126
|
h.delete_if{|k,v| v.size < DEFAULT_OPTIONS[:min_length]}
|
122
127
|
h.map{|k,v| likely_posts << v.first}
|
128
|
+
likely_posts.map{|lp| debug("likely_posts:#{lp.name}.#{lp[:class]} #{lp[:id]}")}
|
123
129
|
likely_posts
|
124
130
|
end
|
125
131
|
|
132
|
+
def collect_likely_elem(h,flag,elem)
|
133
|
+
flag.split(/ /).each do |item|
|
134
|
+
h.map do |k,v|
|
135
|
+
h[k] << elem if k =~ Regexp.new(item)
|
136
|
+
end
|
137
|
+
end
|
138
|
+
h[flag] = ([] << elem)
|
139
|
+
end
|
140
|
+
|
126
141
|
def remove_script_and_style!
|
127
142
|
@html.css("script, style").each { |i| i.remove }
|
128
143
|
end
|
metadata
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: forum_post
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 13
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
version: "0.
|
8
|
+
- 3
|
9
|
+
version: "0.3"
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- ltl3884
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-08-
|
17
|
+
date: 2011-08-09 00:00:00 +08:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|