forum_post 0.2 → 0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. data/Rakefile +1 -1
  2. data/lib/forum_post.rb +28 -13
  3. metadata +4 -4
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ require 'rake/rdoctask'
6
6
  require 'fileutils'
7
7
  include FileUtils
8
8
 
9
- version="0.2"
9
+ version="0.3"
10
10
  name="forum_post"
11
11
 
12
12
  spec = Gem::Specification.new do |s|
data/lib/forum_post.rb CHANGED
@@ -9,7 +9,7 @@ module ForumPost
9
9
 
10
10
  REGEXES = {
11
11
  :unlikelyCandidatesRe => /combx|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
12
- :maybe_post => /article|body|column|main|content|post|topic|text|info|message|item/i,
12
+ :maybe_post => /article|body|column|main|content|post|topic|text|info|message|item|bord|forum/i,
13
13
  :not_post => /author|head|avatar|profile|rank|user|uid/i,
14
14
  :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
15
15
  :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
@@ -32,19 +32,24 @@ module ForumPost
32
32
  remove_script_and_style!#去除scrpt和style
33
33
  remove_unlikely_candidates!#去除不可能是发帖内容的标签
34
34
  transform_misused_divs_into_p!#把一些噪音的div转换成p标签
35
- bests=[]
36
35
  better_post = likely_posts
37
36
  if better_post.size > 0
38
37
  debug("post>4")
39
- @html = most_likely_posts(better_post)
38
+ handle_html=most_likely_posts(better_post)
40
39
  else
40
+ handle_html=@html
41
41
  debug("post<4")
42
42
  end
43
- bests=@html.css("div,tr,td")
44
- return @html.text.gsub(/\s(\s+)/,"") if bests.size==0
43
+ bests=handle_html.css("div,tr,td")
44
+ if bests.size==0
45
+ debug("best_one:#{bests.name}.#{bests[:class]} #{bests[:id]}")
46
+ return handle_html.text.gsub(/\s(\s+)/,"")
47
+ else
48
+ bests.map{|best| debug("bests:#{best.name}.#{best[:class]} #{best[:id]}")}
49
+ end
45
50
  candidates=score_elem(bests)
46
51
  best_elem=select_best(candidates)
47
- best_elem.text
52
+ best_elem.text.gsub(/\s(\s+)/,"")
48
53
  end
49
54
 
50
55
  def score_elem(bests)
@@ -56,6 +61,7 @@ module ForumPost
56
61
  base_score-=8 if elem_size(elem)<DEFAULT_OPTIONS[:min_text_length]
57
62
  elem["score"]=base_score.to_s
58
63
  end
64
+ bests.map{|best| debug("#{best.name}.#{best[:class]} #{best[:id]}---score:#{best['score']}")}
59
65
  bests
60
66
  end
61
67
 
@@ -74,10 +80,14 @@ module ForumPost
74
80
  end
75
81
  last_candidates<<p if flag==0
76
82
  end
77
- return last_candidates.first if last_candidates.size==1
83
+ if last_candidates.size==1
84
+ debug("best_one:#{last_candidates.first.name}.#{last_candidates.first[:class]} #{last_candidates.first[:id]}")
85
+ return last_candidates.first
86
+ end
78
87
  last_candidates.each do |lc|
79
88
  lc["text_rate"] = (elem_size(lc)/elem_size(lc,'inner_html').to_f).to_s
80
89
  end
90
+ last_candidates.map{|lc| debug("best_one:#{lc.name}.#{lc[:class]} #{lc[:id]}---text_rate:#{lc['text_rate']}")}
81
91
  last_candidates.sort{|a,b| b["text_rate"].to_f<=>a["text_rate"].to_f}.first
82
92
  end
83
93
 
@@ -95,7 +105,6 @@ module ForumPost
95
105
  debug("most_likelys:#{q.name}.#{q[:class]}.#{q[:id]}")
96
106
  end
97
107
  end
98
- debug("most_likely_posts:#{most_likely_posts.size}")
99
108
  most_likely_posts.sort{|m,n| elem_size(n)<=>elem_size(m)}.first
100
109
  end
101
110
 
@@ -111,18 +120,24 @@ module ForumPost
111
120
  str = "#{elem[:class]}#{elem[:id]}"
112
121
  if str =~ REGEXES[:maybe_post]
113
122
  flag="#{elem.name},#{elem[:class]}"
114
- if h.keys.include?(flag)
115
- h[flag] << elem
116
- else
117
- h[flag] = ([] << elem)
118
- end
123
+ collect_likely_elem(h,flag,elem)
119
124
  end
120
125
  end
121
126
  h.delete_if{|k,v| v.size < DEFAULT_OPTIONS[:min_length]}
122
127
  h.map{|k,v| likely_posts << v.first}
128
+ likely_posts.map{|lp| debug("likely_posts:#{lp.name}.#{lp[:class]} #{lp[:id]}")}
123
129
  likely_posts
124
130
  end
125
131
 
132
+ def collect_likely_elem(h,flag,elem)
133
+ flag.split(/ /).each do |item|
134
+ h.map do |k,v|
135
+ h[k] << elem if k =~ Regexp.new(item)
136
+ end
137
+ end
138
+ h[flag] = ([] << elem)
139
+ end
140
+
126
141
  def remove_script_and_style!
127
142
  @html.css("script, style").each { |i| i.remove }
128
143
  end
metadata CHANGED
@@ -1,12 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: forum_post
3
3
  version: !ruby/object:Gem::Version
4
- hash: 15
4
+ hash: 13
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 2
9
- version: "0.2"
8
+ - 3
9
+ version: "0.3"
10
10
  platform: ruby
11
11
  authors:
12
12
  - ltl3884
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-08-04 00:00:00 +08:00
17
+ date: 2011-08-09 00:00:00 +08:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency