forum_post 0.2 → 0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/Rakefile +1 -1
  2. data/lib/forum_post.rb +28 -13
  3. metadata +4 -4
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ require 'rake/rdoctask'
6
6
  require 'fileutils'
7
7
  include FileUtils
8
8
 
9
- version="0.2"
9
+ version="0.3"
10
10
  name="forum_post"
11
11
 
12
12
  spec = Gem::Specification.new do |s|
data/lib/forum_post.rb CHANGED
@@ -9,7 +9,7 @@ module ForumPost
9
9
 
10
10
  REGEXES = {
11
11
  :unlikelyCandidatesRe => /combx|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
12
- :maybe_post => /article|body|column|main|content|post|topic|text|info|message|item/i,
12
+ :maybe_post => /article|body|column|main|content|post|topic|text|info|message|item|bord|forum/i,
13
13
  :not_post => /author|head|avatar|profile|rank|user|uid/i,
14
14
  :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
15
15
  :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
@@ -32,19 +32,24 @@ module ForumPost
32
32
  remove_script_and_style!#去除scrpt和style
33
33
  remove_unlikely_candidates!#去除不可能是发帖内容的标签
34
34
  transform_misused_divs_into_p!#把一些噪音的div转换成p标签
35
- bests=[]
36
35
  better_post = likely_posts
37
36
  if better_post.size > 0
38
37
  debug("post>4")
39
- @html = most_likely_posts(better_post)
38
+ handle_html=most_likely_posts(better_post)
40
39
  else
40
+ handle_html=@html
41
41
  debug("post<4")
42
42
  end
43
- bests=@html.css("div,tr,td")
44
- return @html.text.gsub(/\s(\s+)/,"") if bests.size==0
43
+ bests=handle_html.css("div,tr,td")
44
+ if bests.size==0
45
+ debug("best_one:#{bests.name}.#{bests[:class]} #{bests[:id]}")
46
+ return handle_html.text.gsub(/\s(\s+)/,"")
47
+ else
48
+ bests.map{|best| debug("bests:#{best.name}.#{best[:class]} #{best[:id]}")}
49
+ end
45
50
  candidates=score_elem(bests)
46
51
  best_elem=select_best(candidates)
47
- best_elem.text
52
+ best_elem.text.gsub(/\s(\s+)/,"")
48
53
  end
49
54
 
50
55
  def score_elem(bests)
@@ -56,6 +61,7 @@ module ForumPost
56
61
  base_score-=8 if elem_size(elem)<DEFAULT_OPTIONS[:min_text_length]
57
62
  elem["score"]=base_score.to_s
58
63
  end
64
+ bests.map{|best| debug("#{best.name}.#{best[:class]} #{best[:id]}---score:#{best['score']}")}
59
65
  bests
60
66
  end
61
67
 
@@ -74,10 +80,14 @@ module ForumPost
74
80
  end
75
81
  last_candidates<<p if flag==0
76
82
  end
77
- return last_candidates.first if last_candidates.size==1
83
+ if last_candidates.size==1
84
+ debug("best_one:#{last_candidates.first.name}.#{last_candidates.first[:class]} #{last_candidates.first[:id]}")
85
+ return last_candidates.first
86
+ end
78
87
  last_candidates.each do |lc|
79
88
  lc["text_rate"] = (elem_size(lc)/elem_size(lc,'inner_html').to_f).to_s
80
89
  end
90
+ last_candidates.map{|lc| debug("best_one:#{lc.name}.#{lc[:class]} #{lc[:id]}---text_rate:#{lc['text_rate']}")}
81
91
  last_candidates.sort{|a,b| b["text_rate"].to_f<=>a["text_rate"].to_f}.first
82
92
  end
83
93
 
@@ -95,7 +105,6 @@ module ForumPost
95
105
  debug("most_likelys:#{q.name}.#{q[:class]}.#{q[:id]}")
96
106
  end
97
107
  end
98
- debug("most_likely_posts:#{most_likely_posts.size}")
99
108
  most_likely_posts.sort{|m,n| elem_size(n)<=>elem_size(m)}.first
100
109
  end
101
110
 
@@ -111,18 +120,24 @@ module ForumPost
111
120
  str = "#{elem[:class]}#{elem[:id]}"
112
121
  if str =~ REGEXES[:maybe_post]
113
122
  flag="#{elem.name},#{elem[:class]}"
114
- if h.keys.include?(flag)
115
- h[flag] << elem
116
- else
117
- h[flag] = ([] << elem)
118
- end
123
+ collect_likely_elem(h,flag,elem)
119
124
  end
120
125
  end
121
126
  h.delete_if{|k,v| v.size < DEFAULT_OPTIONS[:min_length]}
122
127
  h.map{|k,v| likely_posts << v.first}
128
+ likely_posts.map{|lp| debug("likely_posts:#{lp.name}.#{lp[:class]} #{lp[:id]}")}
123
129
  likely_posts
124
130
  end
125
131
 
132
+ def collect_likely_elem(h,flag,elem)
133
+ flag.split(/ /).each do |item|
134
+ h.map do |k,v|
135
+ h[k] << elem if k =~ Regexp.new(item)
136
+ end
137
+ end
138
+ h[flag] = ([] << elem)
139
+ end
140
+
126
141
  def remove_script_and_style!
127
142
  @html.css("script, style").each { |i| i.remove }
128
143
  end
metadata CHANGED
@@ -1,12 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: forum_post
3
3
  version: !ruby/object:Gem::Version
4
- hash: 15
4
+ hash: 13
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 2
9
- version: "0.2"
8
+ - 3
9
+ version: "0.3"
10
10
  platform: ruby
11
11
  authors:
12
12
  - ltl3884
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-08-04 00:00:00 +08:00
17
+ date: 2011-08-09 00:00:00 +08:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency