forum_post 0.1

Sign up to get free protection for your applications and to get access to all the features.
data/COPYING ADDED
@@ -0,0 +1 @@
1
+ No copying restrictions/license given.
data/LICENSE ADDED
@@ -0,0 +1 @@
1
+ No license given.
data/README ADDED
@@ -0,0 +1,5 @@
1
+ == Forum_post GemPlugin
2
+
3
+ Forum_post extracts the text of the main post from the HTML of a forum thread page. Usage: ForumPost::Document.new(html).content
4
+
5
+
data/Rakefile ADDED
@@ -0,0 +1,62 @@
1
# Build script for the forum_post gem.
#
# Defines the gem specification and the tasks to package, test, document,
# install and uninstall the gem. The default task runs the tests and then
# builds the package.
require 'rake'
require 'rake/testtask'
require 'rake/clean'
require 'rake/gempackagetask'
require 'rake/rdoctask'
require 'fileutils'
include FileUtils

gem_version = "0.1"
gem_name = "forum_post"

# Glob patterns (besides the fixed top-level files) whose matches are shipped
# in the gem, in the order they are appended to the file list.
packaged_patterns = [
  "{bin,doc/rdoc,test,lib}/**/*",
  "ext/**/*.{h,c,rb}",
  "examples/**/*.rb",
  "tools/*.rb",
  "resources/**/*"
]

spec = Gem::Specification.new do |gem|
  gem.name = gem_name
  gem.version = gem_version
  gem.summary = "Get the forum post"
  gem.description = gem.summary
  gem.author = "ltl3884"
  gem.email = "ltl3884@gmail.com"
  gem.add_dependency('nokogiri', '>= 1.4')

  gem.platform = Gem::Platform::RUBY
  gem.has_rdoc = true
  gem.extra_rdoc_files = ["README"]

  gem.files = %w(COPYING LICENSE README Rakefile) +
              packaged_patterns.map { |pattern| Dir.glob(pattern) }.flatten

  gem.require_path = "lib"
  gem.bindir = "bin"
end

# Builds pkg/<name>-<version>.gem; a tarball is also built except on mswin.
Rake::GemPackageTask.new(spec) do |pkg|
  pkg.need_tar = true unless RUBY_PLATFORM =~ /mswin/
end

# Runs the tests, builds the package, then installs the built gem via sudo.
task :install => [:test, :package] do
  sh "sudo gem install pkg/#{gem_name}-#{gem_version}.gem"
end

# Cleans build artefacts, then uninstalls the gem via sudo.
task :uninstall => [:clean] do
  sh "sudo gem uninstall #{gem_name}"
end

Rake::TestTask.new do |test|
  test.libs.push("test")
  test.test_files = FileList['test/test*.rb']
  test.verbose = true
end

Rake::RDocTask.new do |rdoc|
  rdoc.rdoc_dir = 'doc/rdoc'
  rdoc.options.push('--line-numbers')
  rdoc.rdoc_files.add(%w(README LICENSE COPYING lib/**/*.rb doc/**/*.rdoc))
end

task :default => [:test, :package]

# Files and directories removed by the :clean task.
CLEAN.include(%w(build/* **/*.o **/*.so **/*.a lib/*-* **/*.log pkg lib/*.bundle *.gem .config))
62
+
@@ -0,0 +1,6 @@
1
+ require 'gem_plugin'
2
+
3
# Placeholder plugin class generated for the forum_post command.
# Rename ChangeME to the class name you want for the command and replace
# "/somecategory" with the real category string.
# NOTE(review): presumably GemPlugin::Plugin(category) returns the base class
# that registers this plugin under that category -- confirm against gem_plugin.
class ChangeME < GemPlugin::Plugin("/somecategory")
end
6
+
data/lib/forum_post.rb ADDED
@@ -0,0 +1,174 @@
1
+ require 'rubygems'
2
+ require 'nokogiri'
3
+ module ForumPost
4
# Extracts the text of the main post from the HTML of a forum thread page.
#
# The input markup is parsed with Nokogiri; #content then strips noise,
# scores the remaining div/tr/td elements by their class/id names and text
# length, and returns the text of the best-scoring element.
#
# Options (merged over DEFAULT_OPTIONS):
#   :min_length      - minimum number of matching elements sharing the same
#                      tag name and class for them to count as a group of
#                      posts (see #likely_posts).
#   :min_text_length - elements with less text than this are penalised when
#                      scoring (see #score_elem).
#   :debug           - when truthy, progress messages are written with puts.
class Document
  DEFAULT_OPTIONS = {
    :min_length => 5,
    :min_text_length => 15
  }.freeze

  REGEXES = {
    :unlikelyCandidatesRe => /combx|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
    :maybe_post => /article|body|column|main|content|post|topic|text|info|message/i,
    :not_post => /author|head|avatar|profile|rank|user|uid/i,
    :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
    :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
    :replaceFontsRe => /<(\/?)font[^>]*>/i
  }

  attr_accessor :options, :html

  # input   - String of HTML. Runs of two or more <br> tags are turned into
  #           paragraph breaks and <font> tags into <span> before parsing.
  # options - Hash overriding entries of DEFAULT_OPTIONS.
  def initialize(input, options = {})
    @input = input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
    @options = DEFAULT_OPTIONS.merge(options)
    make_html
  end

  # Parses the preprocessed input as UTF-8 HTML and stores it in @html.
  def make_html
    @html = Nokogiri::HTML(@input, nil, 'UTF-8')
  end

  # Returns the text of the element judged most likely to be the main post,
  # or "" when the document has no candidate element at all.
  #
  # Note that this modifies the parsed document in place: nodes are removed,
  # some divs are renamed and "score"/"text_rate" attributes are written.
  def content
    remove_script_and_style!        # drop <script> and <style>
    remove_unlikely_candidates!     # drop elements unlikely to hold post content
    transform_misused_divs_into_p!  # turn text-only "noise" divs into <p>

    # When enough similar post-like elements exist, score only inside the
    # largest outermost one; otherwise score the whole document.
    posts = likely_posts
    scope = posts.empty? ? nil : most_likely_posts(posts)
    scope ||= @html

    best_elem = select_best(score_elem(scope))
    best_elem ? best_elem.text : ""
  end

  # Scores every div/tr/td below most_likely_post and stores the result in
  # each element's "score" attribute (as a String).
  #
  # Starting from 100: +10 when class+id matches :maybe_post, -20 when it
  # matches :not_post, -8 when the text is shorter than :min_text_length.
  #
  # Returns the collection of scored elements.
  def score_elem(most_likely_post)
    candidates = most_likely_post.css("div,tr,td")
    candidates.each do |elem|
      str = "#{elem[:class]}#{elem[:id]}"
      score = 100
      score += 10 if str =~ REGEXES[:maybe_post]
      score -= 20 if str =~ REGEXES[:not_post]
      # Honour the caller's option rather than the hard-coded default.
      score -= 8 if elem_size(elem) < options[:min_text_length]
      elem["score"] = score.to_s
    end
    candidates
  end

  # Picks the best element out of scored candidates.
  #
  # 1. Keep only the elements with the highest "score".
  # 2. Among those, prefer elements that do not "contain" (see #is_contain)
  #    another top-scoring element; if that leaves nothing, keep them all.
  # 3. Break remaining ties by the highest text-to-markup ratio, which is
  #    also written to each remaining element's "text_rate" attribute.
  #
  # Returns the chosen element, or nil when candidates is empty.
  def select_best(candidates)
    scored = candidates.to_a
    return nil if scored.empty?

    best_score = scored.map { |cand| cand["score"].to_i }.max
    top = scored.select { |cand| cand["score"].to_i == best_score }
    return top.first if top.size == 1

    innermost = top.reject do |cand|
      top.any? { |other| other != cand && is_contain(cand, other) }
    end
    return innermost.first if innermost.size == 1
    # Every candidate "contains" another one: fall back to all of them
    # instead of returning nil.
    innermost = top if innermost.empty?

    innermost.each { |cand| cand["text_rate"] = text_rate(cand).to_s }
    innermost.max_by { |cand| cand["text_rate"].to_f }
  end

  # From the given post-like elements, drops those that are "contained" in
  # another one (see #is_contain) and returns the remaining element with the
  # most text. If every element is contained in another, all are considered.
  #
  # Returns nil when likely_posts is empty.
  def most_likely_posts(likely_posts)
    outermost = likely_posts.reject do |inner|
      likely_posts.any? { |outer| outer != inner && is_contain(outer, inner) }
    end
    outermost.each do |elem|
      debug("most_likelys:#{elem.name}.#{elem[:class]}.#{elem[:id]}")
    end
    outermost = likely_posts if outermost.empty?
    outermost.max_by { |elem| elem_size(elem) }
  end

  # Size of an element's content once every run of two or more whitespace
  # characters has been removed (single whitespace characters are kept).
  #
  # type - 'inner_text' (default) measures elem.text,
  #        'inner_html' measures elem.inner_html.
  #
  # Returns an Integer, or nil for any other type.
  def elem_size(elem, type = 'inner_text')
    case type
    when 'inner_text' then elem.text.gsub(/\s{2,}/, "").size
    when 'inner_html' then elem.inner_html.gsub(/\s{2,}/, "").size
    end
  end

  # Groups the div/tr/td elements whose class+id matches :maybe_post by
  # "tag name,class" and returns the first element of every group that has
  # at least options[:min_length] members.
  def likely_posts
    groups = Hash.new { |hash, key| hash[key] = [] }
    @html.css("div,tr,td").each do |elem|
      next unless "#{elem[:class]}#{elem[:id]}" =~ REGEXES[:maybe_post]
      groups["#{elem.name},#{elem[:class]}"] << elem
    end
    # Honour the caller's option rather than the hard-coded default.
    big_enough = groups.values.select { |members| members.size >= options[:min_length] }
    big_enough.map { |members| members.first }
  end

  # Removes every <script> and <style> element from the document.
  def remove_script_and_style!
    @html.css("script, style").each { |node| node.remove }
  end

  # Removes elements whose class+id matches :unlikelyCandidatesRe, unless it
  # also matches :maybe_post or the element is <body>.
  def remove_unlikely_candidates!
    @html.css("*").each do |elem|
      str = "#{elem[:class]}#{elem[:id]}"
      next unless str =~ REGEXES[:unlikelyCandidatesRe]
      next if str =~ REGEXES[:maybe_post]
      next if elem.name.downcase == 'body'

      debug("Removing unlikely candidate - #{str}")
      elem.remove
    end
  end

  # Renames to <p> every <div> whose inner HTML contains none of the tags
  # listed in :divToPElementsRe.
  def transform_misused_divs_into_p!
    @html.css("*").each do |elem|
      next unless elem.name.downcase == "div"
      next if elem.inner_html =~ REGEXES[:divToPElementsRe]

      debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p")
      elem.name = "p"
    end
  end

  # True when p has a div/tr/td descendant with the same tag name and class
  # attribute as q. This compares name and class only; it does not check
  # that the descendant is q itself.
  def is_contain(p, q)
    p.css("div,tr,td").any? do |item|
      item.name == q.name && item[:class] == q[:class]
    end
  end

  # Prints str when the :debug option is set.
  def debug(str)
    puts str if options[:debug]
  end

  private

  # Ratio of text size to inner-HTML size; 0.0 for an element without any
  # markup, so that the ratio is never NaN.
  def text_rate(elem)
    html_size = elem_size(elem, 'inner_html')
    return 0.0 if html_size.zero?
    elem_size(elem) / html_size.to_f
  end
end
160
+ end
161
+
162
+
163
+ #require 'open-uri'
164
+ #include ForumPost
165
+ #uri = 'http://topic.csdn.net/u/20110718/17/f1a523fb-8c65-4510-a094-daf7bd2698cf.html?50656'
166
+ #uri = 'http://bbs.lampchina.net/thread-29528-1-1.html'
167
+ #uri = 'http://bbs.55bbs.com/thread-5662509-1-1.html'
168
+
169
+ #uri='http://topic.csdn.net/u/20110803/11/7ef08712-129d-4971-88a4-4bd0b712c804.html?90401'
170
+ #f = open(uri).read
171
+ #charset = adjudge_html_charset(f)
172
+ #source = convert_to_utf8(f,charset)
173
+ #d = Document.new(source)
174
+ #puts d.content
@@ -0,0 +1,2 @@
1
+ ---
2
+ :debug: false
metadata ADDED
@@ -0,0 +1,86 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: forum_post
3
+ version: !ruby/object:Gem::Version
4
+ hash: 9
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ version: "0.1"
10
+ platform: ruby
11
+ authors:
12
+ - ltl3884
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2011-08-04 00:00:00 +08:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: nokogiri
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ hash: 7
29
+ segments:
30
+ - 1
31
+ - 4
32
+ version: "1.4"
33
+ type: :runtime
34
+ version_requirements: *id001
35
+ description: Get the forum post
36
+ email: ltl3884@gmail.com
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files:
42
+ - README
43
+ files:
44
+ - COPYING
45
+ - LICENSE
46
+ - README
47
+ - Rakefile
48
+ - lib/forum_post.rb
49
+ - lib/forum_post/init.rb
50
+ - resources/defaults.yaml
51
+ has_rdoc: true
52
+ homepage:
53
+ licenses: []
54
+
55
+ post_install_message:
56
+ rdoc_options: []
57
+
58
+ require_paths:
59
+ - lib
60
+ required_ruby_version: !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ hash: 3
66
+ segments:
67
+ - 0
68
+ version: "0"
69
+ required_rubygems_version: !ruby/object:Gem::Requirement
70
+ none: false
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ hash: 3
75
+ segments:
76
+ - 0
77
+ version: "0"
78
+ requirements: []
79
+
80
+ rubyforge_project:
81
+ rubygems_version: 1.3.7
82
+ signing_key:
83
+ specification_version: 3
84
+ summary: Get the forum post
85
+ test_files: []
86
+