forum_post 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/COPYING ADDED
@@ -0,0 +1 @@
1
+ No copying restrictions/license given.
data/LICENSE ADDED
@@ -0,0 +1 @@
1
+ No license given.
data/README ADDED
@@ -0,0 +1,5 @@
1
+ == Forum_post GemPlugin
2
+
3
+ You should document your project here.
4
+
5
+
data/Rakefile ADDED
@@ -0,0 +1,62 @@
1
# Build script for the forum_post gem: packaging, (un)install, tests and RDoc.
#
# Gem::PackageTask (rubygems/package_task) and RDoc::Task (rdoc/task) are used
# in place of Rake::GemPackageTask / Rake::RDocTask, which current Rake
# releases no longer ship.
require 'rake'
require 'rake/testtask'
require 'rake/clean'
require 'rubygems/package_task'
require 'rdoc/task'
require 'fileutils'
include FileUtils

version = "0.1"
name = "forum_post"

# Gem specification shared by the packaging and install tasks below.
spec = Gem::Specification.new do |s|
  s.name = name
  s.version = version
  s.description = s.summary = "Get the forum post"
  s.author = "ltl3884"
  s.email = "ltl3884@gmail.com"
  s.add_dependency('nokogiri', '>= 1.4')

  s.platform = Gem::Platform::RUBY
  # NOTE: `s.has_rdoc = true` was dropped; RubyGems treats it as a deprecated no-op.
  s.extra_rdoc_files = ["README"]

  s.files = %w(COPYING LICENSE README Rakefile) +
            Dir.glob("{bin,doc/rdoc,test,lib}/**/*") +
            Dir.glob("ext/**/*.{h,c,rb}") +
            Dir.glob("examples/**/*.rb") +
            Dir.glob("tools/*.rb") +
            Dir.glob("resources/**/*")

  s.require_path = "lib"
  s.bindir = "bin"
end

# Defines the :package task (pkg/<name>-<version>.gem); a tarball is also
# requested unless RUBY_PLATFORM matches /mswin/.
Gem::PackageTask.new(spec) do |p|
  p.need_tar = true if RUBY_PLATFORM !~ /mswin/
end

# Runs the tests, builds the gem, then installs it through sudo.
task :install => [:test, :package] do
  sh %{sudo gem install pkg/#{name}-#{version}.gem}
end

# Cleans build artefacts, then uninstalls the gem through sudo.
task :uninstall => [:clean] do
  sh %{sudo gem uninstall #{name}}
end

Rake::TestTask.new do |t|
  t.libs << "test"
  t.test_files = FileList['test/test*.rb']
  t.verbose = true
end

RDoc::Task.new do |rdoc|
  rdoc.rdoc_dir = 'doc/rdoc'
  rdoc.options << '--line-numbers'
  rdoc.rdoc_files.add ['README', 'LICENSE', 'COPYING', 'lib/**/*.rb', 'doc/**/*.rdoc']
end

task :default => [:test, :package]

# Files and directories removed by `rake clean`.
CLEAN.include ['build/*', '**/*.o', '**/*.so', '**/*.a', 'lib/*-*', '**/*.log', 'pkg', 'lib/*.bundle', '*.gem', '.config']
62
+
@@ -0,0 +1,6 @@
1
+ require 'gem_plugin'
2
+
3
+ # Template placeholder: rename ChangeME to the class name you want for the forum_post command; "/somecategory" looks like a placeholder too (TODO confirm the intended GemPlugin category).
4
+ class ChangeME < GemPlugin::Plugin "/somecategory"
5
+ end
6
+
data/lib/forum_post.rb ADDED
@@ -0,0 +1,174 @@
1
+ require 'rubygems'
2
+ require 'nokogiri'
3
# Heuristics for pulling the main post text out of a forum thread page.
module ForumPost
  # Wraps one HTML page and extracts the text of the element that most
  # likely holds a post.
  #
  #   ForumPost::Document.new(html_string, :debug => true).content
  #
  # Options (merged over DEFAULT_OPTIONS):
  #   :min_length      - how many elements sharing tag name and class must
  #                      exist before they are treated as a repeated post
  #                      container
  #   :min_text_length - candidates with less text than this lose points
  #   :debug           - when truthy, progress messages are printed with puts
  class Document
    DEFAULT_OPTIONS = {
      :min_length => 5,
      :min_text_length => 15
    }.freeze

    REGEXES = {
      :unlikelyCandidatesRe => /combx|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
      :maybe_post => /article|body|column|main|content|post|topic|text|info|message/i,
      :not_post => /author|head|avatar|profile|rank|user|uid/i,
      :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
      :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
      :replaceFontsRe => /<(\/?)font[^>]*>/i
    }

    # Elements that are considered as possible post containers.
    CANDIDATE_SELECTOR = "div,tr,td"

    # A run of two or more whitespace characters; stripped before measuring.
    WHITESPACE_RUN = /\s(\s+)/

    attr_accessor :options, :html

    # input   - HTML source as a String.
    # options - Hash overriding entries of DEFAULT_OPTIONS.
    #
    # Runs of two or more <br> tags become paragraph breaks and <font> tags
    # become <span> before the markup is parsed.
    def initialize(input, options = {})
      @input = input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
      @options = DEFAULT_OPTIONS.merge(options)
      make_html
    end

    # Parses the pre-processed input as UTF-8 HTML and stores it in @html.
    def make_html
      @html = Nokogiri::HTML(@input, nil, 'UTF-8')
    end

    # Returns the text of the best-scoring candidate element, or "" when the
    # document has no candidate at all.
    #
    # Note that this prunes and rewrites @html in place (scripts, styles and
    # unlikely elements are removed, some divs are renamed to p).
    def content
      remove_script_and_style!        # drop <script> and <style>
      remove_unlikely_candidates!     # drop elements that cannot be post content
      transform_misused_divs_into_p!  # turn text-only divs into p elements

      repeated = likely_posts
      # With enough repeated containers, score inside the largest one;
      # otherwise (or if none survives filtering) score the whole document.
      root = repeated.empty? ? nil : most_likely_posts(repeated)
      best_elem = select_best(score_elem(root || @html))
      best_elem ? best_elem.text : ""
    end

    # Scores every div/tr/td below +most_likely_post+ and stores the result
    # in each element's "score" attribute. Returns the scored elements.
    #
    # Scoring starts at 100: +10 when class/id look like a post, -20 when
    # they look like author/profile chrome, -8 when the text is shorter than
    # the :min_text_length option.
    def score_elem(most_likely_post)
      candidates = most_likely_post.css(CANDIDATE_SELECTOR)
      candidates.each do |elem|
        signature = "#{elem[:class]}#{elem[:id]}"
        score = 100
        score += 10 if signature =~ REGEXES[:maybe_post]
        score -= 20 if signature =~ REGEXES[:not_post]
        score -= 8 if elem_size(elem) < option(:min_text_length)
        elem["score"] = score.to_s
      end
      candidates
    end

    # Picks the winner among scored +candidates+ (anything responding to
    # to_a). Returns nil when there are no candidates.
    #
    # Ties on the top score are narrowed to elements that do not contain a
    # look-alike of another tied element, then broken by the highest ratio
    # of text size to markup size; document order decides remaining ties.
    def select_best(candidates)
      candidates = candidates.to_a
      return nil if candidates.empty?

      best_score = candidates.map { |c| c["score"].to_i }.max
      top = candidates.select { |c| c["score"].to_i == best_score }
      return top.first if top.size == 1

      innermost = top.reject do |outer|
        top.any? { |other| outer != other && is_contain(outer, other) }
      end
      # Every tied element may contain a look-alike of another one; in that
      # case rank all of them instead of returning nothing.
      innermost = top if innermost.empty?
      return innermost.first if innermost.size == 1

      innermost.each { |node| node["text_rate"] = text_rate(node).to_s }
      innermost.max_by { |node| node["text_rate"].to_f }
    end

    # From the group representatives in +likely_posts+, keeps those that are
    # not matched (by tag name and class) inside another representative and
    # returns the one with the most text, or nil when none is left.
    def most_likely_posts(likely_posts)
      outermost = likely_posts.reject do |inner|
        likely_posts.any? { |outer| outer != inner && is_contain(outer, inner) }
      end
      outermost.each { |q| debug("most_likelys:#{q.name}.#{q[:class]}.#{q[:id]}") }
      outermost.max_by { |elem| elem_size(elem) }
    end

    # Length of the element's text ('inner_text', the default) or markup
    # ('inner_html') after removing runs of two or more whitespace
    # characters. Returns nil for any other +type+.
    def elem_size(elem, type = 'inner_text')
      raw = case type
            when 'inner_text' then elem.text
            when 'inner_html' then elem.inner_html
            end
      raw && raw.gsub(WHITESPACE_RUN, "").size
    end

    # Groups post-looking div/tr/td elements by "tag,class" and returns the
    # first element of every group that has at least :min_length members.
    def likely_posts
      groups = Hash.new { |hash, key| hash[key] = [] }
      @html.css(CANDIDATE_SELECTOR).each do |elem|
        next unless "#{elem[:class]}#{elem[:id]}" =~ REGEXES[:maybe_post]
        groups["#{elem.name},#{elem[:class]}"] << elem
      end
      groups.values.reject { |members| members.size < option(:min_length) }.map { |members| members.first }
    end

    # Removes all <script> and <style> elements from the document.
    def remove_script_and_style!
      @html.css("script, style").each { |node| node.remove }
    end

    # Removes elements whose class/id match the unlikely-candidate pattern
    # and do not also look like a post; <body> is never removed.
    def remove_unlikely_candidates!
      @html.css("*").each do |elem|
        str = "#{elem[:class]}#{elem[:id]}"
        next unless str =~ REGEXES[:unlikelyCandidatesRe]
        next if str =~ REGEXES[:maybe_post] || elem.name.downcase == 'body'
        debug("Removing unlikely candidate - #{str}")
        elem.remove
      end
    end

    # Renames every div that contains no block-level/link/image markup
    # (see REGEXES[:divToPElementsRe]) to p.
    def transform_misused_divs_into_p!
      @html.css("*").each do |elem|
        next unless elem.name.downcase == "div"
        next if elem.inner_html =~ REGEXES[:divToPElementsRe]
        debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p")
        elem.name = "p"
      end
    end

    # True when some div/tr/td below +p+ has the same tag name and class
    # attribute as +q+.
    def is_contain(p, q)
      p.css(CANDIDATE_SELECTOR).any? do |item|
        item.name == q.name && item[:class] == q[:class]
      end
    end

    # Prints +str+ when the :debug option is truthy.
    def debug(str)
      puts str if option(:debug)
    end

    private

    # Looks up +key+ in the instance options, falling back to DEFAULT_OPTIONS
    # when the options hash is missing or does not define the key.
    def option(key)
      (options || {}).fetch(key) { DEFAULT_OPTIONS[key] }
    end

    # Ratio of text size to markup size; 0.0 for an element without markup.
    def text_rate(elem)
      html_size = elem_size(elem, 'inner_html')
      html_size.zero? ? 0.0 : elem_size(elem) / html_size.to_f
    end
  end
end
161
+
162
+
163
+ #require 'open-uri'
164
+ #include ForumPost
165
+ #uri = 'http://topic.csdn.net/u/20110718/17/f1a523fb-8c65-4510-a094-daf7bd2698cf.html?50656'
166
+ #uri = 'http://bbs.lampchina.net/thread-29528-1-1.html'
167
+ #uri = 'http://bbs.55bbs.com/thread-5662509-1-1.html'
168
+
169
+ #uri='http://topic.csdn.net/u/20110803/11/7ef08712-129d-4971-88a4-4bd0b712c804.html?90401'
170
+ #f = open(uri).read
171
+ #charset = adjudge_html_charset(f)
172
+ #source = convert_to_utf8(f,charset)
173
+ #d = Document.new(source)
174
+ #puts d.content
@@ -0,0 +1,2 @@
1
+ ---
2
+ :debug: false
metadata ADDED
@@ -0,0 +1,86 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: forum_post
3
+ version: !ruby/object:Gem::Version
4
+ hash: 9
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ version: "0.1"
10
+ platform: ruby
11
+ authors:
12
+ - ltl3884
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2011-08-04 00:00:00 +08:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: nokogiri
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ hash: 7
29
+ segments:
30
+ - 1
31
+ - 4
32
+ version: "1.4"
33
+ type: :runtime
34
+ version_requirements: *id001
35
+ description: Get the forum post
36
+ email: ltl3884@gmail.com
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files:
42
+ - README
43
+ files:
44
+ - COPYING
45
+ - LICENSE
46
+ - README
47
+ - Rakefile
48
+ - lib/forum_post.rb
49
+ - lib/forum_post/init.rb
50
+ - resources/defaults.yaml
51
+ has_rdoc: true
52
+ homepage:
53
+ licenses: []
54
+
55
+ post_install_message:
56
+ rdoc_options: []
57
+
58
+ require_paths:
59
+ - lib
60
+ required_ruby_version: !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ hash: 3
66
+ segments:
67
+ - 0
68
+ version: "0"
69
+ required_rubygems_version: !ruby/object:Gem::Requirement
70
+ none: false
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ hash: 3
75
+ segments:
76
+ - 0
77
+ version: "0"
78
+ requirements: []
79
+
80
+ rubyforge_project:
81
+ rubygems_version: 1.3.7
82
+ signing_key:
83
+ specification_version: 3
84
+ summary: Get the forum post
85
+ test_files: []
86
+