forum_post 0.1 → 0.2

Sign up to get free protection for your applications and to get access to all the features.
data/README CHANGED
@@ -1,5 +1,15 @@
1
- == Forum_post GemPlugin
1
+ == Forum_post_v0.2/ GemPlugin
2
2
 
3
- You should document your project here.
3
+ A gem that extracts the main post content from a forum page's HTML.
4
4
 
5
+ install:
6
+ gem install forum_post
7
+ example:
8
+ require 'rubygems'
9
+ require 'forum_post'
10
+ require 'open-uri'
11
+ uri='http://topic.csdn.net/u/20110718/17/f1a523fb-8c65-4510-a094-daf7bd2698cf.html?50656'
12
+ source=open(uri).read
13
+ doc=ForumPost::Document.new(source)
14
+ puts doc.content
5
15
 
data/Rakefile CHANGED
@@ -6,7 +6,7 @@ require 'rake/rdoctask'
6
6
  require 'fileutils'
7
7
  include FileUtils
8
8
 
9
- version="0.1"
9
+ version="0.2"
10
10
  name="forum_post"
11
11
 
12
12
  spec = Gem::Specification.new do |s|
data/lib/forum_post.rb CHANGED
@@ -9,7 +9,7 @@ module ForumPost
9
9
 
10
10
  REGEXES = {
11
11
  :unlikelyCandidatesRe => /combx|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
12
- :maybe_post => /article|body|column|main|content|post|topic|text|info|message/i,
12
+ :maybe_post => /article|body|column|main|content|post|topic|text|info|message|item/i,
13
13
  :not_post => /author|head|avatar|profile|rank|user|uid/i,
14
14
  :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
15
15
  :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
@@ -32,21 +32,23 @@ module ForumPost
32
32
  remove_script_and_style!#去除script和style
33
33
  remove_unlikely_candidates!#去除不可能是发帖内容的标签
34
34
  transform_misused_divs_into_p!#把一些噪音的div转换成p标签
35
- if likely_posts.size > 0
36
- #回帖数大于5
37
- most_likely_post = most_likely_posts(likely_posts)
38
- candidates=score_elem(most_likely_post)
39
- else
40
- #回帖数小于5
41
- candidates=score_elem(@html)
42
- end
43
- best_elem=select_best(candidates)
44
- best_elem.text
35
+ bests=[]
36
+ better_post = likely_posts
37
+ if better_post.size > 0
38
+ debug("post>4")
39
+ @html = most_likely_posts(better_post)
40
+ else
41
+ debug("post<4")
42
+ end
43
+ bests=@html.css("div,tr,td")
44
+ return @html.text.gsub(/\s(\s+)/,"") if bests.size==0
45
+ candidates=score_elem(bests)
46
+ best_elem=select_best(candidates)
47
+ best_elem.text
45
48
  end
46
49
 
47
- def score_elem(most_likely_post)
48
- candidates=most_likely_post.css("div,tr,td")
49
- candidates.each do |elem|
50
+ def score_elem(bests)
51
+ bests.each do |elem|
50
52
  base_score=100
51
53
  str = "#{elem[:class]}#{elem[:id]}"
52
54
  base_score+=10 if str =~ REGEXES[:maybe_post]
@@ -54,7 +56,7 @@ module ForumPost
54
56
  base_score-=8 if elem_size(elem)<DEFAULT_OPTIONS[:min_text_length]
55
57
  elem["score"]=base_score.to_s
56
58
  end
57
- candidates
59
+ bests
58
60
  end
59
61
 
60
62
  def select_best(candidates)
@@ -79,11 +81,11 @@ module ForumPost
79
81
  last_candidates.sort{|a,b| b["text_rate"].to_f<=>a["text_rate"].to_f}.first
80
82
  end
81
83
 
82
- def most_likely_posts(likely_posts)
84
+ def most_likely_posts(better_post)
83
85
  most_likely_posts=[]
84
- likely_posts.each do |q|
86
+ better_post.each do |q|
85
87
  flag=0
86
- likely_posts.each do |p|
88
+ better_post.each do |p|
87
89
  if is_contain(p,q) == true && p != q
88
90
  flag+=1
89
91
  end
@@ -93,6 +95,7 @@ module ForumPost
93
95
  debug("most_likelys:#{q.name}.#{q[:class]}.#{q[:id]}")
94
96
  end
95
97
  end
98
+ debug("most_likely_posts:#{most_likely_posts.size}")
96
99
  most_likely_posts.sort{|m,n| elem_size(n)<=>elem_size(m)}.first
97
100
  end
98
101
 
@@ -128,7 +131,7 @@ module ForumPost
128
131
  @html.css("*").each do |elem|
129
132
  str = "#{elem[:class]}#{elem[:id]}"
130
133
  if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:maybe_post] && elem.name.downcase != 'body'
131
- debug("Removing unlikely candidate - #{str}")
134
+ debug("Removing unlikely candidate - #{str}")
132
135
  elem.remove
133
136
  end
134
137
  end
@@ -138,7 +141,7 @@ module ForumPost
138
141
  @html.css("*").each do |elem|
139
142
  if elem.name.downcase == "div"
140
143
  if elem.inner_html !~ REGEXES[:divToPElementsRe]
141
- debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
144
+ #debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
142
145
  elem.name = "p"
143
146
  end
144
147
  end
@@ -1,6 +1,6 @@
1
1
  require 'gem_plugin'
2
2
 
3
- # give this class the name you want for your command forum_post
3
+ # give this class the name you want for your command forum_post_v0.2/
4
4
  class ChangeME < GemPlugin::Plugin "/somecategory"
5
5
  end
6
6
 
metadata CHANGED
@@ -1,12 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: forum_post
3
3
  version: !ruby/object:Gem::Version
4
- hash: 9
4
+ hash: 15
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
- - 1
9
- version: "0.1"
8
+ - 2
9
+ version: "0.2"
10
10
  platform: ruby
11
11
  authors:
12
12
  - ltl3884
@@ -46,7 +46,7 @@ files:
46
46
  - README
47
47
  - Rakefile
48
48
  - lib/forum_post.rb
49
- - lib/forum_post/init.rb
49
+ - lib/forum_post_v0.2/init.rb
50
50
  - resources/defaults.yaml
51
51
  has_rdoc: true
52
52
  homepage: