forum_post 0.1 → 0.2

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
data/README CHANGED
@@ -1,5 +1,15 @@
1
- == Forum_post GemPlugin
1
+ == Forum_post_v0.2/ GemPlugin
2
2
 
3
- You should document your project here.
3
+ A gem that extracts the main post content from a forum page's HTML.
4
4
 
5
+ install:
6
+ gem install forum_post
7
+ example:
8
+ require 'rubygems'
9
+ require 'forum_post'
10
+ require 'open-uri'
11
+ uri='http://topic.csdn.net/u/20110718/17/f1a523fb-8c65-4510-a094-daf7bd2698cf.html?50656'
12
+ source=open(uri).read
13
+ doc=ForumPost::Document.new(source)
14
+ puts doc.content
5
15
 
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ require 'rake/rdoctask'
6
6
  require 'fileutils'
7
7
  include FileUtils
8
8
 
9
- version="0.1"
9
+ version="0.2"
10
10
  name="forum_post"
11
11
 
12
12
  spec = Gem::Specification.new do |s|
data/lib/forum_post.rb CHANGED
@@ -9,7 +9,7 @@ module ForumPost
9
9
 
10
10
  REGEXES = {
11
11
  :unlikelyCandidatesRe => /combx|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
12
- :maybe_post => /article|body|column|main|content|post|topic|text|info|message/i,
12
+ :maybe_post => /article|body|column|main|content|post|topic|text|info|message|item/i,
13
13
  :not_post => /author|head|avatar|profile|rank|user|uid/i,
14
14
  :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
15
15
  :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
@@ -32,21 +32,23 @@ module ForumPost
32
32
  remove_script_and_style!#去除scrpt和style
33
33
  remove_unlikely_candidates!#去除不可能是发帖内容的标签
34
34
  transform_misused_divs_into_p!#把一些噪音的div转换成p标签
35
- if likely_posts.size > 0
36
- #回帖数大于5
37
- most_likely_post = most_likely_posts(likely_posts)
38
- candidates=score_elem(most_likely_post)
39
- else
40
- #回帖数小于5
41
- candidates=score_elem(@html)
42
- end
43
- best_elem=select_best(candidates)
44
- best_elem.text
35
+ bests=[]
36
+ better_post = likely_posts
37
+ if better_post.size > 0
38
+ debug("post>4")
39
+ @html = most_likely_posts(better_post)
40
+ else
41
+ debug("post<4")
42
+ end
43
+ bests=@html.css("div,tr,td")
44
+ return @html.text.gsub(/\s(\s+)/,"") if bests.size==0
45
+ candidates=score_elem(bests)
46
+ best_elem=select_best(candidates)
47
+ best_elem.text
45
48
  end
46
49
 
47
- def score_elem(most_likely_post)
48
- candidates=most_likely_post.css("div,tr,td")
49
- candidates.each do |elem|
50
+ def score_elem(bests)
51
+ bests.each do |elem|
50
52
  base_score=100
51
53
  str = "#{elem[:class]}#{elem[:id]}"
52
54
  base_score+=10 if str =~ REGEXES[:maybe_post]
@@ -54,7 +56,7 @@ module ForumPost
54
56
  base_score-=8 if elem_size(elem)<DEFAULT_OPTIONS[:min_text_length]
55
57
  elem["score"]=base_score.to_s
56
58
  end
57
- candidates
59
+ bests
58
60
  end
59
61
 
60
62
  def select_best(candidates)
@@ -79,11 +81,11 @@ module ForumPost
79
81
  last_candidates.sort{|a,b| b["text_rate"].to_f<=>a["text_rate"].to_f}.first
80
82
  end
81
83
 
82
- def most_likely_posts(likely_posts)
84
+ def most_likely_posts(better_post)
83
85
  most_likely_posts=[]
84
- likely_posts.each do |q|
86
+ better_post.each do |q|
85
87
  flag=0
86
- likely_posts.each do |p|
88
+ better_post.each do |p|
87
89
  if is_contain(p,q) == true && p != q
88
90
  flag+=1
89
91
  end
@@ -93,6 +95,7 @@ module ForumPost
93
95
  debug("most_likelys:#{q.name}.#{q[:class]}.#{q[:id]}")
94
96
  end
95
97
  end
98
+ debug("most_likely_posts:#{most_likely_posts.size}")
96
99
  most_likely_posts.sort{|m,n| elem_size(n)<=>elem_size(m)}.first
97
100
  end
98
101
 
@@ -128,7 +131,7 @@ module ForumPost
128
131
  @html.css("*").each do |elem|
129
132
  str = "#{elem[:class]}#{elem[:id]}"
130
133
  if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:maybe_post] && elem.name.downcase != 'body'
131
- debug("Removing unlikely candidate - #{str}")
134
+ debug("Removing unlikely candidate - #{str}")
132
135
  elem.remove
133
136
  end
134
137
  end
@@ -138,7 +141,7 @@ module ForumPost
138
141
  @html.css("*").each do |elem|
139
142
  if elem.name.downcase == "div"
140
143
  if elem.inner_html !~ REGEXES[:divToPElementsRe]
141
- debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
144
+ #debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
142
145
  elem.name = "p"
143
146
  end
144
147
  end
@@ -1,6 +1,6 @@
1
1
  require 'gem_plugin'
2
2
 
3
- # give this class the name you want for your command forum_post
3
+ # give this class the name you want for your command forum_post_v0.2/
4
4
  class ChangeME < GemPlugin::Plugin "/somecategory"
5
5
  end
6
6
 
metadata CHANGED
@@ -1,12 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: forum_post
3
3
  version: !ruby/object:Gem::Version
4
- hash: 9
4
+ hash: 15
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 1
9
- version: "0.1"
8
+ - 2
9
+ version: "0.2"
10
10
  platform: ruby
11
11
  authors:
12
12
  - ltl3884
@@ -46,7 +46,7 @@ files:
46
46
  - README
47
47
  - Rakefile
48
48
  - lib/forum_post.rb
49
- - lib/forum_post/init.rb
49
+ - lib/forum_post_v0.2/init.rb
50
50
  - resources/defaults.yaml
51
51
  has_rdoc: true
52
52
  homepage: