htmlclipping 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. data/lib/htmlclipping.rb +243 -0
  2. data/lib/htmlclipping.rb~ +208 -0
  3. metadata +41 -0
@@ -0,0 +1,243 @@
1
+ # HtmlClipping generates excerpts from an HTML page that has a link pointing to
2
+ # a particular URI. It removes most HTML markup, bolds the link text, and
3
+ # trims the resulting text to a fixed number of characters. I developed it to
4
+ # help me track referers to my website, though I suppose it might have other
5
+ # uses.
6
+ #
7
+ # For example, the following script gets the HTML at http://rubyforge.org/credits/, and forms an excerpt around the link to http://www.rubycentral.org/pledge/.
8
+ #
9
+ # require 'htmlclipping'
10
+ # require 'net/http'
11
+ #
12
+ # contents = ''
13
+ # Net::HTTP.start( 'rubyforge.org' ) do |http|
14
+ # response = http.get '/credits/'
15
+ # contents = response.body
16
+ # end
17
+ # clipping = HtmlClipping.new(
18
+ # contents, 'http://www.rubycentral.org/pledge/', 500
19
+ # )
20
+ # puts clipping.to_s
21
+ #
22
+ # => "&#8230; RubyForge takes time, effort, and money. Many thanks to the
23
+ # folks listed below who are making it possible! <br /> If RubyForge has
24
+ # been helpful to you, and you want to give something back to the Ruby
25
+ # community, please consider supporting <strong>RubyCentral</strong>.
26
+ # Thanks! <br /> InfoEther, Inc purchased the RubyForge hardware and
27
+ # provides system administration support. <br /> Several folks provide
28
+ # file mirrors to help share the bandwidth load: <br /> Evan Webb <br />
29
+ # Dennis Oelkers <br /> Austin &#8230;"
30
+ #
31
+ # The Rubyforge project page can be found at http://rubyforge.org/projects/htmlclipping.
32
+
33
+ require 'iconv'
34
+
35
+ class HtmlClipping
36
+ Version = '0.1.0'
37
+
38
+ # html:: The HTML of the referring web page.
39
+ # referred_uri:: The URI that is being referred to.
40
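+ # excerpt_limit:: The target maximum size of the resulting clipping, as counted by String#size. The result can overshoot by the last word added and by the ellipsis markers.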
41
def initialize( html, referred_uri, excerpt_limit )
  # Keep the raw page, the link target to highlight and the size budget.
  @contents, @referred_uri, @excerpt_limit = html, referred_uri, excerpt_limit
  # A <meta ... text/html; charset=...> declaration names the page encoding;
  # when one is found, prepare a converter from that encoding to UTF-8.
  # Without one, @converter is simply never assigned.
  declared = ( @contents =~ %r{<meta.*text/html; charset=(.*?)('|")}i )
  @converter = Iconv.new( 'UTF-8', $1 ) if declared
end
49
+
50
# Passes +str+ through @converter's +iconv+ method when a converter has been
# set up; otherwise hands +str+ back untouched.
def convert( str ) # :nodoc:
  return str unless @converter
  @converter.iconv( str )
end
53
+
54
+ # Returns the clipping as a string suitable for use as XML text.
55
def to_s
  # Work on what is inside <body>. A missing </body> is tolerated by taking
  # everything after the opening tag, and a page with no <body> tag at all is
  # processed as a whole instead of handing nil to the Detokenizer.
  if @contents =~ %r{<body[^>]*>(.*)</body>}mi
    to_detokenize = $1
  elsif @contents =~ %r{<body[^>]*>(.*)}mi
    to_detokenize = $1
  else
    to_detokenize = @contents.to_s
  end
  # Flatten the markup (the matching link becomes <strong>...</strong>),
  # then trim the text around that link to the configured limit.
  excerpt = Detokenizer.new( to_detokenize, @referred_uri ).execute
  excerpt = Clipper.new( excerpt, @excerpt_limit ).execute
  # An earlier revision called the non-destructive String#gsub here to turn
  # bytes \200-\377 into numeric character references and discarded the
  # result, so it never affected the output. The statement is dropped: its
  # byte-range regexp literal cannot be compiled by Ruby >= 1.9, and charset
  # handling is done by #convert.
  convert( excerpt )
end
67
+
68
# An Array holding, in their original order, every whitespace-separated word
# of a text, with each literal '<br />' marker kept as an element of its own.
class ArrayOfWordsAndBrTags < Array # :nodoc:
  def initialize( text )
    super()
    # The capture group makes split return the '<br />' separators as well
    # as the stretches of text between them.
    text.split( %r{(<br />)} ).each do |chunk|
      chunk == '<br />' ? push( chunk ) : concat( chunk.split )
    end
  end
end
80
+
81
# Cuts a detokenized excerpt down to roughly +excerpt_limit+ characters. The
# <strong>...</strong> link text stays in the middle; words (and '<br />'
# markers) are added around it one at a time until the limit is reached or
# the text runs out.
class Clipper # :nodoc:
  # excerpt:: Text in which the link of interest is wrapped in <strong> tags.
  # excerpt_limit:: Size budget, compared against String#size totals.
  def initialize( excerpt, excerpt_limit )
    @excerpt = excerpt
    @excerpt_limit = excerpt_limit
  end

  # Adds one word to the excerpt. While both sides still have words, the side
  # whose accumulated string is shorter receives it, and the text before the
  # link wins ties; once a side is exhausted the other side is used.
  def add_next_word
    if @words_before.empty?
      add_from_words_after
    elsif @words_after.empty? || @str_after.size >= @str_before.size
      add_from_words_before
    else
      add_from_words_after
    end
  end

  # True while the excerpt is under the limit and at least one side still has
  # unused words.
  def add_next_word?
    excerpt_parts_under_limit &&
      !( @words_before.empty? && @words_after.empty? )
  end

  # Appends the next word following the link to @str_after.
  def add_from_words_after
    if @str_after == ''
      @str_after = @words_after.shift
    else
      @str_after += ' ' + @words_after.shift
    end
  end

  # Prepends the nearest remaining word preceding the link to @str_before.
  def add_from_words_before
    if @str_before == ''
      @str_before = @words_before.pop
    else
      @str_before = @words_before.pop + ' ' + @str_before
    end
  end

  # Whether the three parts, plus 2 for the spaces that will join them, are
  # still below the limit. The ellipsis markers are not counted, and the
  # check runs before each word is added, so the final text can end up
  # somewhat longer than the limit.
  def excerpt_parts_under_limit
    ( @str_before.size + @strong_text.size + @str_after.size + 2 ) <
      @excerpt_limit
  end

  # Returns the clipped excerpt as a String. When the excerpt holds no
  # <strong>...</strong> marker there is nothing to centre the clipping on
  # and an empty String is returned (previously this failed with a
  # NoMethodError on nil).
  def execute
    # The greedy first group means the last <strong> element in the text is
    # the one used; \S* keeps punctuation glued to the closing tag.
    return '' unless @excerpt.to_s =~ %r{(.*)(<strong>.*?</strong>\S*)(.*)}m
    @text_before, @strong_text, @text_after = $1, $2, $3
    get_elts.join( ' ' )
  end

  # Builds the list of excerpt parts: optional leading ellipsis, text before
  # the link, the link itself, text after it, optional trailing ellipsis.
  # An ellipsis is added on a side only when words on that side were left
  # out. Empty before/after strings are still included, so the joined result
  # can carry a leading or trailing space.
  def get_elts
    @words_before = ArrayOfWordsAndBrTags.new( @text_before )
    @words_after = ArrayOfWordsAndBrTags.new( @text_after )
    @str_before = ''
    @str_after = ''
    add_next_word while add_next_word?
    ellipses = '&#8230;'
    elts = []
    elts << ellipses unless @words_before.empty?
    elts << @str_before
    elts << @strong_text
    elts << @str_after
    elts << ellipses unless @words_after.empty?
    elts
  end
end
149
+
150
# Flattens HTML into plain text: block-level tags become '<br />' markers,
# inline tags are dropped, links are replaced by their text, and the link
# whose href equals +referred_uri+ is wrapped in <strong>...</strong>.
class Detokenizer # :nodoc:
  # contents:: The HTML fragment to flatten.
  # referred_uri:: The href value whose link text gets highlighted.
  def initialize( contents, referred_uri )
    @contents, @referred_uri = contents, referred_uri
  end

  # Collapses runs of '<br />' markers separated only by whitespace into a
  # single marker. Repeats because one pass can leave two markers adjacent.
  def compact_brs
    loop do
      # gsub! returns nil once a pass makes no replacement.
      break unless @detokenized.gsub!( %r{<br />\s*<br />}, '<br /> ' )
    end
  end

  # Returns the flattened text (the HtmlWithFixedAttributes string built from
  # the contents, modified in place).
  def execute
    @detokenized = HtmlWithFixedAttributes.new( @contents )
    @detokenized.gsub!( /<!--.*?-->/m, '' )
    # Block-level tags turn into line-break markers. The pattern is
    # multiline so that a tag whose attributes continue on the next line is
    # matched as well, as the inline-tag pattern below already did.
    @detokenized.gsub!(
      %r{</?(h\d|p|blockquote|table|tr|br|div|form|ul|li|center|ol|dl|dd|dt|fieldset|option|select|object|o:p).*?>}im,
      ' <br /> '
    )
    # Inline tags are removed outright. This includes any <strong> already
    # in the page, so the only <strong> left afterwards is the one added by
    # substitute_links.
    @detokenized.gsub!(
      %r{</?(acronym|abbr|strong|td|tt|small|em|img|font|span|input|hr|noscript|legend|address).*?>}im, ''
    )
    # <b> and <i> need whitespace or '>' right after the name so that other
    # tags starting with those letters are left alone.
    @detokenized.gsub!( %r{</?(b|i)(\s+.*?)?>}im, '' )
    substitute_links
    compact_brs
    @detokenized.gsub!( /\s+/, ' ' )
    @detokenized
  end

  # Replaces every <a ...>text</a> element with its text, wrapped in
  # <strong> when the href attribute equals the referred URI exactly.
  def substitute_links
    @detokenized.gsub!( %r{<a\s+(.*?)>(.*?)</(xhtml:)?a.*?>}mi ) {
      # Copy the captures first: the href match below overwrites $1 and $2.
      a_att, a_body = $1, $2
      matches_self = ( a_att =~ /href=('|")?([^'" ]*)("|')?/i ) &&
        $2 == @referred_uri
      matches_self ? "<strong>#{ a_body }</strong>" : a_body
    }
  end
end
194
+
195
# A String built from HTML in which the contents of <script> elements are
# replaced by a single space and any '>' found inside a quoted attribute
# value is rewritten as '&gt;'.
class HtmlWithFixedAttributes < String # :nodoc:
  def initialize( contents )
    super( '' )
    open_scripts = []
    # Splitting on a captured pattern yields the script tags themselves as
    # well as the text between them.
    contents.split( %r{(</?script[^>]*>)}i ).each do |piece|
      case piece
      when /^<script/i then open_scripts.push( piece )
      when %r{</script}i then open_scripts.pop
      else
        # Text outside any script is kept (with attributes fixed); text
        # inside a script collapses to one space.
        self << ( open_scripts.empty? ? fix_attributes( piece ) : ' ' )
      end
    end
  end

  # Cuts +html_part+ into pieces that each run from one '<' up to the next,
  # repairs the pieces that contain a quote character, and returns the
  # pieces glued back together.
  def fix_attributes( html_part )
    html_part.split( /(<.*?)(?=<)/ ).inject( '' ) do |fixed, part|
      quoted = ( part != '' && part =~ /('|")/ )
      fixed << ( quoted ? fix_bracket_plus( part ) : part )
    end
  end

  # Walks +part+ delimiter by delimiter (quotes and angle brackets), keeping
  # track of whether the position is inside a tag and inside a quoted
  # attribute value. Returns a copy in which each '>' met inside a quoted
  # value is replaced by '&gt;'.
  def fix_bracket_plus( part )
    in_quotes = false
    outside_tag = true
    open_quote = nil
    out = ''
    part.scan( /([^'"<>]*)("|'|<|>)/ ) do |text, mark|
      mark = '&gt;' if in_quotes && mark == '>'
      case mark
      when "'", '"'
        # Inside a tag, a quote opens a value, or closes it when it is the
        # same kind of quote that opened it.
        if !outside_tag && ( !in_quotes || open_quote == mark )
          in_quotes = !in_quotes
          open_quote = mark
        end
      when '<', '>'
        outside_tag = !outside_tag
      end
      out << text << mark
    end
    # Whatever follows the last delimiter was not visited by the scan.
    if part =~ /.*('|"|<|>)(.*)/m
      out << $2
    else
      out << part
    end
    out
  end
end
243
+ end
@@ -0,0 +1,208 @@
1
+ require 'iconv'
2
+
3
# Builds a short excerpt ("clipping") of an HTML page around the link that
# points at a given URI: markup is flattened, the link text is wrapped in
# <strong>, and the surrounding text is trimmed to a size budget.
class HtmlClipping
  Version = '0.1.0'

  # html:: The HTML of the referring web page.
  # referred_uri:: The URI that is being referred to.
  # excerpt_limit:: Size budget for the clipping, counted with String#size.
  def initialize( html, referred_uri, excerpt_limit )
    @contents = html
    # A declared charset gets a converter from that charset to UTF-8.
    if @contents =~ %r{<meta.*text/html; charset=(.*?)('|")}i
      @converter = Iconv.new( 'UTF-8', $1 )
    end
    @referred_uri = referred_uri
    @excerpt_limit = excerpt_limit
  end

  # Passes +str+ through the converter when one was set up in #initialize;
  # otherwise returns +str+ unchanged.
  def convert( str )
    @converter ? @converter.iconv( str ) : str
  end

  # Returns the clipping as a String.
  def to_s
    # Work on what is inside <body>. A missing </body> is tolerated by
    # taking everything after the opening tag, and a page with no <body> tag
    # at all is processed as a whole instead of handing nil on.
    if @contents =~ %r{<body[^>]*>(.*)</body>}mi
      to_detokenize = $1
    elsif @contents =~ %r{<body[^>]*>(.*)}mi
      to_detokenize = $1
    else
      to_detokenize = @contents.to_s
    end
    excerpt = Detokenizer.new( to_detokenize, @referred_uri ).execute
    excerpt = Clipper.new( excerpt, @excerpt_limit ).execute
    # An earlier revision called the non-destructive String#gsub here to
    # turn bytes \200-\377 into numeric character references and discarded
    # the result, so it never affected the output. The statement is dropped:
    # its byte-range regexp literal cannot be compiled by Ruby >= 1.9, and
    # charset handling is done by #convert.
    convert( excerpt )
  end

  # An Array holding, in their original order, every whitespace-separated
  # word of a text, with each '<br />' marker kept as its own element.
  class ArrayOfWordsAndBrTags < Array
    def initialize( text )
      super()
      # The capture group makes split return the '<br />' separators too.
      text.split( %r{(<br />)} ).each { |br_or_between_br|
        if br_or_between_br == '<br />'
          self << br_or_between_br
        else
          self.concat( br_or_between_br.split )
        end
      }
    end
  end

  # Cuts a detokenized excerpt down to roughly +excerpt_limit+ characters,
  # keeping the <strong>...</strong> link text in the middle and adding
  # words around it one at a time.
  class Clipper
    def initialize( excerpt, excerpt_limit )
      @excerpt = excerpt
      @excerpt_limit = excerpt_limit
    end

    # Adds one word. While both sides still have words, the side whose
    # accumulated string is shorter receives it (text before the link wins
    # ties); once a side is exhausted the other side is used.
    def add_next_word
      if @words_before.empty?
        add_from_words_after
      elsif @words_after.empty? || @str_after.size >= @str_before.size
        add_from_words_before
      else
        add_from_words_after
      end
    end

    # True while under the limit and at least one side has unused words.
    def add_next_word?
      excerpt_parts_under_limit &&
        !( @words_before.empty? && @words_after.empty? )
    end

    # Appends the next word following the link to @str_after.
    def add_from_words_after
      if @str_after == ''
        @str_after = @words_after.shift
      else
        @str_after += ' ' + @words_after.shift
      end
    end

    # Prepends the nearest remaining word preceding the link to @str_before.
    def add_from_words_before
      if @str_before == ''
        @str_before = @words_before.pop
      else
        @str_before = @words_before.pop + ' ' + @str_before
      end
    end

    # Whether the three parts, plus 2 for the joining spaces, are still below
    # the limit. Ellipsis markers are not counted and the check runs before
    # each word is added, so the result can exceed the limit somewhat.
    def excerpt_parts_under_limit
      ( @str_before.size + @strong_text.size + @str_after.size + 2 ) <
        @excerpt_limit
    end

    # Returns the clipped excerpt, or an empty String when the excerpt holds
    # no <strong>...</strong> marker (previously a NoMethodError on nil).
    def execute
      # The greedy first group selects the last <strong> element; \S* keeps
      # punctuation glued to the closing tag.
      return '' unless @excerpt.to_s =~ %r{(.*)(<strong>.*?</strong>\S*)(.*)}m
      @text_before, @strong_text, @text_after = $1, $2, $3
      get_elts.join( ' ' )
    end

    # Builds the excerpt parts: optional leading ellipsis, text before the
    # link, the link, text after it, optional trailing ellipsis. An ellipsis
    # appears on a side only when words on that side were left out.
    def get_elts
      @words_before = ArrayOfWordsAndBrTags.new( @text_before )
      @words_after = ArrayOfWordsAndBrTags.new( @text_after )
      @str_before = ''
      @str_after = ''
      add_next_word while add_next_word?
      ellipses = '&#8230;'
      elts = []
      elts << ellipses unless @words_before.empty?
      elts << @str_before
      elts << @strong_text
      elts << @str_after
      elts << ellipses unless @words_after.empty?
      elts
    end
  end

  # Flattens HTML into plain text: block-level tags become '<br />' markers,
  # inline tags are dropped, links are replaced by their text, and the link
  # whose href equals +referred_uri+ is wrapped in <strong>...</strong>.
  class Detokenizer
    def initialize( contents, referred_uri )
      @contents, @referred_uri = contents, referred_uri
    end

    # Collapses runs of '<br />' markers separated only by whitespace into
    # one marker, repeating until a pass changes nothing.
    def compact_brs
      loop do
        # gsub! returns nil once a pass makes no replacement.
        break unless @detokenized.gsub!( %r{<br />\s*<br />}, '<br /> ' )
      end
    end

    # Returns the flattened text.
    def execute
      @detokenized = HtmlWithFixedAttributes.new( @contents )
      @detokenized.gsub!( /<!--.*?-->/m, '' )
      # Block-level tags turn into line-break markers. The pattern is
      # multiline so that a tag whose attributes continue on the next line
      # is matched as well, as the inline-tag pattern below already did.
      @detokenized.gsub!(
        %r{</?(h\d|p|blockquote|table|tr|br|div|form|ul|li|center|ol|dl|dd|dt|fieldset|option|select|object|o:p).*?>}im,
        ' <br /> '
      )
      # Inline tags are removed outright, including any <strong> already in
      # the page, so the only <strong> left is the one substitute_links adds.
      @detokenized.gsub!(
        %r{</?(acronym|abbr|strong|td|tt|small|em|img|font|span|input|hr|noscript|legend|address).*?>}im, ''
      )
      # <b> and <i> need whitespace or '>' right after the name so that
      # other tags starting with those letters are left alone.
      @detokenized.gsub!( %r{</?(b|i)(\s+.*?)?>}im, '' )
      substitute_links
      compact_brs
      @detokenized.gsub!( /\s+/, ' ' )
      @detokenized
    end

    # Replaces every <a ...>text</a> element with its text, wrapped in
    # <strong> when the href attribute equals the referred URI exactly.
    def substitute_links
      @detokenized.gsub!( %r{<a\s+(.*?)>(.*?)</(xhtml:)?a.*?>}mi ) {
        # Copy the captures first: the href match overwrites $1 and $2.
        a_att, a_body = $1, $2
        matches_self = ( a_att =~ /href=('|")?([^'" ]*)("|')?/i ) &&
          $2 == @referred_uri
        matches_self ? "<strong>#{ a_body }</strong>" : a_body
      }
    end
  end

  # A String built from HTML in which the contents of <script> elements are
  # replaced by a single space and any '>' found inside a quoted attribute
  # value is rewritten as '&gt;'.
  class HtmlWithFixedAttributes < String
    def initialize( contents )
      super( '' )
      script_stack = []
      contents.split( %r{(</?script[^>]*>)}i ).each { |script_or_between|
        if script_or_between =~ /^<script/i
          script_stack.push( script_or_between )
        elsif script_or_between =~ %r{</script}i
          script_stack.pop
        elsif script_stack.empty?
          self << fix_attributes( script_or_between )
        else
          # Text inside a script element collapses to one space.
          self << ' '
        end
      }
    end

    # Cuts +html_part+ into pieces running from one '<' up to the next and
    # repairs the pieces that contain a quote character.
    def fix_attributes( html_part )
      fixed = ''
      html_part.split( /(<.*?)(?=<)/ ).each { |part|
        part = fix_bracket_plus( part ) if part != '' && part =~ /('|")/
        fixed << part
      }
      fixed
    end

    # Walks +part+ delimiter by delimiter, tracking whether the position is
    # inside a tag and inside a quoted attribute value, and returns a copy
    # in which each '>' met inside a quoted value is replaced by '&gt;'.
    def fix_bracket_plus( part )
      inside_attribute = false
      between_tags = true
      last_attribute_quote = nil
      fixed = ''
      part.scan( /([^'"<>]*)("|'|<|>)/ ) { |pre_delimiter, delimiter|
        delimiter.gsub!( />/, '&gt;' ) if inside_attribute
        if %w( ' " ).include?( delimiter ) && !between_tags
          # A quote opens a value, or closes it when it is the same kind of
          # quote that opened it.
          if last_attribute_quote == delimiter || !inside_attribute
            inside_attribute = !inside_attribute
            last_attribute_quote = delimiter
          end
        end
        between_tags = !between_tags if %w( < > ).include?( delimiter )
        fixed << pre_delimiter
        fixed << delimiter
      }
      # Whatever follows the last delimiter was not visited by the scan.
      post_scan = ( part =~ /.*('|"|<|>)(.*)/m ) ? $2 : part
      fixed << post_scan
      fixed
    end
  end
end
metadata ADDED
@@ -0,0 +1,41 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.6
3
+ specification_version: 1
4
+ name: htmlclipping
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2005-05-15
8
+ summary: HtmlClipping generates excerpts from an HTML page that has a link pointing to a particular URI.
9
+ require_paths:
10
+ - lib
11
+ email: sera@fhwang.net
12
+ homepage: http://htmlclipping.rubyforge.org/
13
+ rubyforge_project:
14
+ description: "HtmlClipping generates excerpts from an HTML page that has a link pointing to a
15
+ particular URI. It removes most HTML markup, bolds the link text, and trims the
16
+ resulting text to a fixed number of characters. I developed it to help me track
17
+ referers to my website, though I suppose it might have other uses."
18
+ autorequire: htmlclipping
19
+ default_executable:
20
+ bindir: bin
21
+ has_rdoc: false
22
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
23
+ requirements:
24
+ -
25
+ - ">"
26
+ - !ruby/object:Gem::Version
27
+ version: 0.0.0
28
+ version:
29
+ platform: ruby
30
+ authors:
31
+ - Francis Hwang
32
+ files:
33
+ - lib/htmlclipping.rb
34
+ - lib/htmlclipping.rb~
35
+ test_files: []
36
+ rdoc_options: []
37
+ extra_rdoc_files: []
38
+ executables: []
39
+ extensions: []
40
+ requirements: []
41
+ dependencies: []