htmlclipping 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/lib/htmlclipping.rb +243 -0
  2. data/lib/htmlclipping.rb~ +208 -0
  3. metadata +41 -0
@@ -0,0 +1,243 @@
1
+ # HtmlClipping generates excerpts from an HTML page that has a link pointing to
2
+ # a particular URI. It removes most HTML markup, bolds the link text, and
3
+ # trims the resulting text to a fixed number of characters. I developed it to
4
+ # help me track referers to my website, though I suppose it might have other
5
+ # uses.
6
+ #
7
+ # For example, the following script gets the HTML at http://rubyforge.org/credits/, and forms an excerpt around the link to http://www.rubycentral.org/pledge/.
8
+ #
9
+ # require 'htmlclipping'
10
+ # require 'net/http'
11
+ #
12
+ # contents = ''
13
+ # Net::HTTP.start( 'rubyforge.org' ) do |http|
14
+ # response = http.get '/credits/'
15
+ # contents = response.body
16
+ # end
17
+ # clipping = HtmlClipping.new(
18
+ # contents, 'http://www.rubycentral.org/pledge/', 500
19
+ # )
20
+ # puts clipping.to_s
21
+ #
22
+ # => "… RubyForge takes time, effort, and money. Many thanks to the
23
+ # folks listed below who are making it possible! <br /> If RubyForge has
24
+ # been helpful to you, and you want to give something back to the Ruby
25
+ # community, please consider supporting <strong>RubyCentral</strong>.
26
+ # Thanks! <br /> InfoEther, Inc purchased the RubyForge hardware and
27
+ # provides system administration support. <br /> Several folks provide
28
+ # file mirrors to help share the bandwidth load: <br /> Evan Webb <br />
29
+ # Dennis Oelkers <br /> Austin &#8230;"
30
+ #
31
+ # The RubyForge project page can be found at http://rubyforge.org/projects/htmlclipping.
32
+
33
+ require 'iconv'
34
+
35
class HtmlClipping
  Version = '0.1.0'

  # html:: The HTML of the referring web page.
  # referred_uri:: The URI that is being referred to.
  # excerpt_limit:: The maximum size of the resulting clipping
  #
  # If the page declares a charset in a <meta> tag it is remembered, so that
  # to_s can transcode the finished clipping to UTF-8.
  def initialize( html, referred_uri, excerpt_limit )
    @contents = html
    @charset = nil
    @converter = nil
    if @contents =~ %r{<meta.*text/html; charset=(.*?)('|")}i
      @charset = $1
      # Iconv is not part of the standard library on newer Rubies; only use it
      # when the host program has actually loaded it. Otherwise convert falls
      # back to String#encode.
      @converter = Iconv.new( 'UTF-8', @charset ) if defined?( Iconv )
    end
    @referred_uri = referred_uri
    @excerpt_limit = excerpt_limit
  end

  # Transcodes +str+ from the charset declared by the page to UTF-8. Returns
  # +str+ unchanged when the page declared no charset.
  def convert( str ) # :nodoc:
    return @converter.iconv( str ) if @converter
    return str if @charset.nil? || @charset.empty?
    return str unless str.respond_to?( :encode )
    # dup first so the caller's string keeps its original encoding tag.
    str.dup.force_encoding( @charset ).encode( 'UTF-8' )
  end

  # Returns the clipping as a string suitable for use as XML text.
  #
  # Returns an empty string when the page contains no link whose href is
  # exactly the referred URI. Pages without a <body> tag are clipped from
  # their full contents.
  def to_s
    to_detokenize =
      if @contents =~ %r{<body[^>]*>(.*)</body>}mi
        $1
      elsif @contents =~ %r{<body[^>]*>(.*)}mi
        # Unclosed <body>: take everything after the opening tag.
        $1
      else
        # HTML fragment without any <body> tag.
        @contents.to_s
      end
    excerpt = Detokenizer.new( to_detokenize, @referred_uri ).execute
    excerpt = Clipper.new( excerpt, @excerpt_limit ).execute
    convert( excerpt )
  end

  # Array of whitespace-separated words in which every '<br />' marker is kept
  # as a single element of its own.
  class ArrayOfWordsAndBrTags < Array # :nodoc:
    def initialize( text )
      super()
      # The capture group makes split keep the '<br />' separators.
      text.split( %r{(<br />)} ).each { |br_or_between_br|
        if br_or_between_br == '<br />'
          self << br_or_between_br
        else
          concat( br_or_between_br.split )
        end
      }
    end
  end

  # Trims detokenized text around the <strong> link, growing the excerpt one
  # word at a time on alternating sides until the size limit is reached.
  class Clipper # :nodoc:
    def initialize( excerpt, excerpt_limit )
      @excerpt = excerpt
      @excerpt_limit = excerpt_limit
    end

    # Adds one word on whichever side is currently shorter (or on the only
    # side that still has words left).
    def add_next_word
      if @words_before.empty?
        add_from_words_after
      elsif @words_after.empty?
        add_from_words_before
      elsif @str_after.size >= @str_before.size
        add_from_words_before
      else
        add_from_words_after
      end
    end

    # True while the excerpt is under the limit and words remain to be added.
    def add_next_word?
      excerpt_parts_under_limit &&
        !( @words_before.empty? && @words_after.empty? )
    end

    def add_from_words_after
      if @str_after == ''
        @str_after = @words_after.shift
      else
        @str_after += ' ' + @words_after.shift
      end
    end

    def add_from_words_before
      if @str_before == ''
        @str_before = @words_before.pop
      else
        @str_before = @words_before.pop + ' ' + @str_before
      end
    end

    # The + 2 accounts for the two spaces that join the three parts.
    def excerpt_parts_under_limit
      ( @str_before.size + @strong_text.size + @str_after.size + 2 ) <
        @excerpt_limit
    end

    # Returns the clipped excerpt, or '' when the text holds no <strong> link
    # to clip around.
    def execute
      # The leading (.*) is greedy, so the last <strong> link is the pivot.
      return '' unless @excerpt =~ %r{(.*)(<strong>.*?</strong>\S*)(.*)}m
      @text_before, @strong_text, @text_after = $1, $2, $3
      get_elts.join( ' ' )
    end

    # Builds the ordered excerpt parts, with an ellipsis entity on each side
    # that still has words left over.
    def get_elts
      @words_before = ArrayOfWordsAndBrTags.new( @text_before )
      @words_after = ArrayOfWordsAndBrTags.new( @text_after )
      @str_before = ''
      @str_after = ''
      add_next_word while add_next_word?
      ellipses = '&#8230;'
      elts = []
      elts << ellipses unless @words_before.empty?
      elts << @str_before
      elts << @strong_text
      elts << @str_after
      elts << ellipses unless @words_after.empty?
      elts
    end
  end

  # Reduces HTML to plain text plus '<br />' markers, wrapping the body of
  # every link to the referred URI in <strong> tags.
  class Detokenizer # :nodoc:
    def initialize( contents, referred_uri )
      @contents, @referred_uri = contents, referred_uri
    end

    # Collapses runs of '<br />' markers into a single marker. Repeats until
    # gsub! reports no change, because one pass can leave new adjacent pairs.
    def compact_brs
      loop do
        break unless @detokenized.gsub!( %r{<br />\s*<br />}, '<br /> ' )
      end
    end

    def execute
      @detokenized = HtmlWithFixedAttributes.new( @contents )
      @detokenized.gsub!( /<!--.*?-->/m, '' )
      # Block-level tags become line-break markers.
      @detokenized.gsub!(
        %r{</?(h\d|p|blockquote|table|tr|br|div|form|ul|li|center|ol|dl|dd|dt|fieldset|option|select|object|o:p).*?>}i,
        ' <br /> '
      )
      # Inline tags are dropped. Any <strong> in the page is removed here, so
      # the only <strong> left afterwards is the one substitute_links adds.
      @detokenized.gsub!(
        %r{</?(acronym|abbr|strong|td|tt|small|em|img|font|span|input|hr|noscript|legend|address).*?>}im, ''
      )
      @detokenized.gsub!( %r{</?(b|i)(\s+.*?)?>}i, '' )
      substitute_links
      compact_brs
      @detokenized.gsub!( /\s+/, ' ' )
      @detokenized
    end

    # Replaces every <a> element with its body, wrapped in <strong> when its
    # href is exactly the referred URI.
    def substitute_links
      @detokenized.gsub!( %r{<a\s+(.*?)>(.*?)</(xhtml:)?a.*?>}mi ) {
        # Grab the captures first; the href match below overwrites $1 and $2.
        a_att, a_body = $1, $2
        href = ( a_att =~ /href=('|")?([^'" ]*)("|')?/i ) ? $2 : nil
        if !href.nil? && href == @referred_uri
          "<strong>#{ a_body }</strong>"
        else
          a_body
        end
      }
    end
  end

  # A copy of the HTML with <script> elements blanked out and any '>' inside
  # quoted attribute values escaped, so later tag-stripping regexes do not
  # stop early.
  class HtmlWithFixedAttributes < String # :nodoc:
    def initialize( contents )
      super( '' )
      script_stack = []
      contents.split( %r{(</?script[^>]*>)}i ).each { |script_or_between|
        if script_or_between =~ /^<script/i
          script_stack.push( script_or_between )
        elsif script_or_between =~ %r{</script}i
          script_stack.pop
        elsif script_stack.empty?
          self << fix_attributes( script_or_between )
        else
          # Text inside a <script> element is replaced by one space.
          self << ' '
        end
      }
    end

    # Splits +html_part+ into chunks that each start at a '<' and repairs
    # every chunk that contains a quote character.
    def fix_attributes( html_part )
      fixed = ''.dup
      html_part.split( /(<.*?)(?=<)/ ).each { |part|
        part = fix_bracket_plus( part ) if part != '' && part =~ /('|")/
        fixed << part
      }
      fixed
    end

    # Walks the delimiters (quotes and angle brackets) of +part+, tracking
    # whether the scan is inside a tag and inside a quoted attribute value,
    # and rewrites any '>' found inside an attribute value as '&gt;'.
    def fix_bracket_plus( part )
      inside_attribute = false
      between_tags = true
      last_attribute_quote = nil
      fixed = ''.dup
      part.scan( /([^'"<>]*)("|'|<|>)/ ) { |pre_delimiter, delimiter|
        delimiter.gsub!( />/, '&gt;' ) if inside_attribute
        if %w( ' " ).include?( delimiter ) && !between_tags
          # Only the quote character that opened the value may close it.
          if last_attribute_quote == delimiter || !inside_attribute
            inside_attribute = !inside_attribute
            last_attribute_quote = delimiter
          end
        end
        # An escaped '&gt;' no longer matches here, so it does not end the tag.
        between_tags = !between_tags if %w( < > ).include?( delimiter )
        fixed << pre_delimiter
        fixed << delimiter
      }
      # scan drops any text after the final delimiter; append it back.
      post_scan = ( part =~ /.*('|"|<|>)(.*)/m ) ? $2 : part
      fixed << post_scan
      fixed
    end
  end
end
@@ -0,0 +1,208 @@
1
+ require 'iconv'
2
+
3
class HtmlClipping
  Version = '0.1.0'

  # html:: The HTML of the referring web page.
  # referred_uri:: The URI that is being referred to.
  # excerpt_limit:: The maximum size of the resulting clipping
  #
  # If the page declares a charset in a <meta> tag it is remembered, so that
  # to_s can transcode the finished clipping to UTF-8.
  def initialize( html, referred_uri, excerpt_limit )
    @contents = html
    @charset = nil
    @converter = nil
    if @contents =~ %r{<meta.*text/html; charset=(.*?)('|")}i
      @charset = $1
      # Iconv is not part of the standard library on newer Rubies; only use it
      # when the host program has actually loaded it. Otherwise convert falls
      # back to String#encode.
      @converter = Iconv.new( 'UTF-8', @charset ) if defined?( Iconv )
    end
    @referred_uri = referred_uri
    @excerpt_limit = excerpt_limit
  end

  # Transcodes +str+ from the charset declared by the page to UTF-8. Returns
  # +str+ unchanged when the page declared no charset.
  def convert( str )
    return @converter.iconv( str ) if @converter
    return str if @charset.nil? || @charset.empty?
    return str unless str.respond_to?( :encode )
    # dup first so the caller's string keeps its original encoding tag.
    str.dup.force_encoding( @charset ).encode( 'UTF-8' )
  end

  # Returns the clipping as a string suitable for use as XML text.
  #
  # Returns an empty string when the page contains no link whose href is
  # exactly the referred URI. Pages without a <body> tag are clipped from
  # their full contents.
  def to_s
    to_detokenize =
      if @contents =~ %r{<body[^>]*>(.*)</body>}mi
        $1
      elsif @contents =~ %r{<body[^>]*>(.*)}mi
        # Unclosed <body>: take everything after the opening tag.
        $1
      else
        # HTML fragment without any <body> tag.
        @contents.to_s
      end
    excerpt = Detokenizer.new( to_detokenize, @referred_uri ).execute
    excerpt = Clipper.new( excerpt, @excerpt_limit ).execute
    convert( excerpt )
  end

  # Array of whitespace-separated words in which every '<br />' marker is kept
  # as a single element of its own.
  class ArrayOfWordsAndBrTags < Array
    def initialize( text )
      super()
      # The capture group makes split keep the '<br />' separators.
      text.split( %r{(<br />)} ).each { |br_or_between_br|
        if br_or_between_br == '<br />'
          self << br_or_between_br
        else
          concat( br_or_between_br.split )
        end
      }
    end
  end

  # Trims detokenized text around the <strong> link, growing the excerpt one
  # word at a time on alternating sides until the size limit is reached.
  class Clipper
    def initialize( excerpt, excerpt_limit )
      @excerpt = excerpt
      @excerpt_limit = excerpt_limit
    end

    # Adds one word on whichever side is currently shorter (or on the only
    # side that still has words left).
    def add_next_word
      if @words_before.empty?
        add_from_words_after
      elsif @words_after.empty?
        add_from_words_before
      elsif @str_after.size >= @str_before.size
        add_from_words_before
      else
        add_from_words_after
      end
    end

    # True while the excerpt is under the limit and words remain to be added.
    def add_next_word?
      excerpt_parts_under_limit &&
        !( @words_before.empty? && @words_after.empty? )
    end

    def add_from_words_after
      if @str_after == ''
        @str_after = @words_after.shift
      else
        @str_after += ' ' + @words_after.shift
      end
    end

    def add_from_words_before
      if @str_before == ''
        @str_before = @words_before.pop
      else
        @str_before = @words_before.pop + ' ' + @str_before
      end
    end

    # The + 2 accounts for the two spaces that join the three parts.
    def excerpt_parts_under_limit
      ( @str_before.size + @strong_text.size + @str_after.size + 2 ) <
        @excerpt_limit
    end

    # Returns the clipped excerpt, or '' when the text holds no <strong> link
    # to clip around.
    def execute
      # The leading (.*) is greedy, so the last <strong> link is the pivot.
      return '' unless @excerpt =~ %r{(.*)(<strong>.*?</strong>\S*)(.*)}m
      @text_before, @strong_text, @text_after = $1, $2, $3
      get_elts.join( ' ' )
    end

    # Builds the ordered excerpt parts, with an ellipsis entity on each side
    # that still has words left over.
    def get_elts
      @words_before = ArrayOfWordsAndBrTags.new( @text_before )
      @words_after = ArrayOfWordsAndBrTags.new( @text_after )
      @str_before = ''
      @str_after = ''
      add_next_word while add_next_word?
      ellipses = '&#8230;'
      elts = []
      elts << ellipses unless @words_before.empty?
      elts << @str_before
      elts << @strong_text
      elts << @str_after
      elts << ellipses unless @words_after.empty?
      elts
    end
  end

  # Reduces HTML to plain text plus '<br />' markers, wrapping the body of
  # every link to the referred URI in <strong> tags.
  class Detokenizer
    def initialize( contents, referred_uri )
      @contents, @referred_uri = contents, referred_uri
    end

    # Collapses runs of '<br />' markers into a single marker. Repeats until
    # gsub! reports no change, because one pass can leave new adjacent pairs.
    def compact_brs
      loop do
        break unless @detokenized.gsub!( %r{<br />\s*<br />}, '<br /> ' )
      end
    end

    def execute
      @detokenized = HtmlWithFixedAttributes.new( @contents )
      @detokenized.gsub!( /<!--.*?-->/m, '' )
      # Block-level tags become line-break markers.
      @detokenized.gsub!(
        %r{</?(h\d|p|blockquote|table|tr|br|div|form|ul|li|center|ol|dl|dd|dt|fieldset|option|select|object|o:p).*?>}i,
        ' <br /> '
      )
      # Inline tags are dropped. Any <strong> in the page is removed here, so
      # the only <strong> left afterwards is the one substitute_links adds.
      @detokenized.gsub!(
        %r{</?(acronym|abbr|strong|td|tt|small|em|img|font|span|input|hr|noscript|legend|address).*?>}im, ''
      )
      @detokenized.gsub!( %r{</?(b|i)(\s+.*?)?>}i, '' )
      substitute_links
      compact_brs
      @detokenized.gsub!( /\s+/, ' ' )
      @detokenized
    end

    # Replaces every <a> element with its body, wrapped in <strong> when its
    # href is exactly the referred URI.
    def substitute_links
      @detokenized.gsub!( %r{<a\s+(.*?)>(.*?)</(xhtml:)?a.*?>}mi ) {
        # Grab the captures first; the href match below overwrites $1 and $2.
        a_att, a_body = $1, $2
        href = ( a_att =~ /href=('|")?([^'" ]*)("|')?/i ) ? $2 : nil
        if !href.nil? && href == @referred_uri
          "<strong>#{ a_body }</strong>"
        else
          a_body
        end
      }
    end
  end

  # A copy of the HTML with <script> elements blanked out and any '>' inside
  # quoted attribute values escaped, so later tag-stripping regexes do not
  # stop early.
  class HtmlWithFixedAttributes < String
    def initialize( contents )
      super( '' )
      script_stack = []
      contents.split( %r{(</?script[^>]*>)}i ).each { |script_or_between|
        if script_or_between =~ /^<script/i
          script_stack.push( script_or_between )
        elsif script_or_between =~ %r{</script}i
          script_stack.pop
        elsif script_stack.empty?
          self << fix_attributes( script_or_between )
        else
          # Text inside a <script> element is replaced by one space.
          self << ' '
        end
      }
    end

    # Splits +html_part+ into chunks that each start at a '<' and repairs
    # every chunk that contains a quote character.
    def fix_attributes( html_part )
      fixed = ''.dup
      html_part.split( /(<.*?)(?=<)/ ).each { |part|
        part = fix_bracket_plus( part ) if part != '' && part =~ /('|")/
        fixed << part
      }
      fixed
    end

    # Walks the delimiters (quotes and angle brackets) of +part+, tracking
    # whether the scan is inside a tag and inside a quoted attribute value,
    # and rewrites any '>' found inside an attribute value as '&gt;'.
    def fix_bracket_plus( part )
      inside_attribute = false
      between_tags = true
      last_attribute_quote = nil
      fixed = ''.dup
      part.scan( /([^'"<>]*)("|'|<|>)/ ) { |pre_delimiter, delimiter|
        delimiter.gsub!( />/, '&gt;' ) if inside_attribute
        if %w( ' " ).include?( delimiter ) && !between_tags
          # Only the quote character that opened the value may close it.
          if last_attribute_quote == delimiter || !inside_attribute
            inside_attribute = !inside_attribute
            last_attribute_quote = delimiter
          end
        end
        # An escaped '&gt;' no longer matches here, so it does not end the tag.
        between_tags = !between_tags if %w( < > ).include?( delimiter )
        fixed << pre_delimiter
        fixed << delimiter
      }
      # scan drops any text after the final delimiter; append it back.
      post_scan = ( part =~ /.*('|"|<|>)(.*)/m ) ? $2 : part
      fixed << post_scan
      fixed
    end
  end
end
metadata ADDED
@@ -0,0 +1,41 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.6
3
+ specification_version: 1
4
+ name: htmlclipping
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2005-05-15
8
+ summary: HtmlClipping generates excerpts from an HTML page that has a link pointing to a particular URI.
9
+ require_paths:
10
+ - lib
11
+ email: sera@fhwang.net
12
+ homepage: http://htmlclipping.rubyforge.org/
13
+ rubyforge_project:
14
+ description: "HtmlClipping generates excerpts from an HTML page that has a link pointing to a
15
+ particular URI. It removes most HTML markup, bolds the link text, and trims the
16
+ resulting text to a fixed number of characters. I developed it to help me track
17
+ referers to my website, though I suppose it might have other uses."
18
+ autorequire: htmlclipping
19
+ default_executable:
20
+ bindir: bin
21
+ has_rdoc: false
22
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
23
+ requirements:
24
+ -
25
+ - ">"
26
+ - !ruby/object:Gem::Version
27
+ version: 0.0.0
28
+ version:
29
+ platform: ruby
30
+ authors:
31
+ - Francis Hwang
32
+ files:
33
+ - lib/htmlclipping.rb
34
+ - lib/htmlclipping.rb~
35
+ test_files: []
36
+ rdoc_options: []
37
+ extra_rdoc_files: []
38
+ executables: []
39
+ extensions: []
40
+ requirements: []
41
+ dependencies: []