htmlclipping 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/htmlclipping.rb +243 -0
- data/lib/htmlclipping.rb~ +208 -0
- metadata +41 -0
data/lib/htmlclipping.rb
ADDED
@@ -0,0 +1,243 @@
|
|
1
|
+
# HtmlClipping generates excerpts from an HTML page that has a link pointing to
|
2
|
+
# a particular URI. It removes most HTML markup, bolds the link text, and
|
3
|
+
# trims the resulting text to a fixed number of characters. I developed it to
|
4
|
+
# help me track referers to my website, though I suppose it might have other
|
5
|
+
# uses.
|
6
|
+
#
|
7
|
+
# For example, the following script gets the HTML at http://rubyforge.org/credits/, and forms an excerpt around the link to http://www.rubycentral.org/pledge/.
|
8
|
+
#
|
9
|
+
# require 'htmlclipping'
|
10
|
+
# require 'net/http'
|
11
|
+
#
|
12
|
+
# contents = ''
|
13
|
+
# Net::HTTP.start( 'rubyforge.org' ) do |http|
|
14
|
+
# response = http.get '/credits/'
|
15
|
+
# contents = response.body
|
16
|
+
# end
|
17
|
+
# clipping = HtmlClipping.new(
|
18
|
+
# contents, 'http://www.rubycentral.org/pledge/', 500
|
19
|
+
# )
|
20
|
+
# puts clipping.to_s
|
21
|
+
#
|
22
|
+
# => "… RubyForge takes time, effort, and money. Many thanks to the
|
23
|
+
# folks listed below who are making it possible! <br /> If RubyForge has
|
24
|
+
# been helpful to you, and you want to give something back to the Ruby
|
25
|
+
# community, please consider supporting <strong>RubyCentral</strong>.
|
26
|
+
# Thanks! <br /> InfoEther, Inc purchased the RubyForge hardware and
|
27
|
+
# provides system administration support. <br /> Several folks provide
|
28
|
+
# file mirrors to help share the bandwidth load: <br /> Evan Webb <br />
|
29
|
+
# Dennis Oelkers <br /> Austin …"
|
30
|
+
#
|
31
|
+
# The RubyForge project page can be found at http://rubyforge.org/projects/htmlclipping.
|
32
|
+
|
33
|
+
require 'iconv'
|
34
|
+
|
35
|
+
class HtmlClipping
  Version = '0.1.0'

  # Prepares a clipping generator for one referring page.
  #
  # html:: The HTML of the referring web page.
  # referred_uri:: The URI that is being referred to.
  # excerpt_limit:: The size at which the clipping stops growing. The limit is
  #                 checked before each word is added, so the final word may
  #                 push the result slightly past this value.
  def initialize( html, referred_uri, excerpt_limit )
    @contents = html
    # When the page declares its charset in a <meta> tag, set up a converter
    # from that charset to UTF-8; otherwise text is passed through untouched.
    # NOTE(review): Iconv left the standard library in Ruby 2.0; on newer
    # Rubies this branch needs the iconv gem -- confirm the target runtime.
    if @contents =~ %r{<meta.*text/html; charset=(.*?)('|")}i
      @converter = Iconv.new( 'UTF-8', $1 )
    end
    @referred_uri = referred_uri
    @excerpt_limit = excerpt_limit
  end

  # Converts +str+ to UTF-8 when a charset was detected in the page, and
  # returns it unchanged otherwise.
  def convert( str ) # :nodoc:
    @converter ? @converter.iconv( str ) : str
  end

  # Returns the clipping as a string suitable for use as XML text.
  #
  # The text inside <body> is used when a <body> tag is present (with or
  # without a closing tag). Input that has no <body> tag at all is treated as
  # an HTML fragment and processed whole; it used to raise NoMethodError.
  def to_s
    to_detokenize =
      if @contents =~ %r{<body[^>]*>(.*)</body>}mi
        $1
      elsif @contents =~ %r{<body[^>]*>(.*)}mi
        $1
      else
        @contents
      end
    excerpt = Detokenizer.new( to_detokenize, @referred_uri ).execute
    excerpt = Clipper.new( excerpt, @excerpt_limit ).execute
    # A former gsub over high bytes was dropped here: it discarded its result,
    # so it never changed the excerpt.
    convert( excerpt )
  end

  # An Array holding the whitespace-separated words of a text, where every
  # '<br />' marker is kept as a single element of its own.
  class ArrayOfWordsAndBrTags < Array # :nodoc:
    def initialize( text )
      super()
      # Splitting on a captured group keeps the '<br />' separators in the
      # result, so they can be told apart from the text between them.
      text.split( %r{(<br />)} ).each { |br_or_between_br|
        if br_or_between_br == '<br />'
          self << br_or_between_br
        else
          self.concat( br_or_between_br.split )
        end
      }
    end
  end

  # Trims detokenized text to an excerpt centred on the <strong> link text,
  # growing word by word on both sides until the limit is reached.
  class Clipper # :nodoc:
    def initialize( excerpt, excerpt_limit )
      @excerpt = excerpt
      @excerpt_limit = excerpt_limit
    end

    # Adds one word to whichever side is currently shorter (the side before
    # the link wins ties), or to the only side that still has words left.
    def add_next_word
      if @words_before.empty?
        add_from_words_after
      elsif @words_after.empty?
        add_from_words_before
      elsif @str_after.size >= @str_before.size
        add_from_words_before
      else
        add_from_words_after
      end
    end

    # True while the excerpt is under the limit and words remain on at least
    # one side.
    def add_next_word?
      ( excerpt_parts_under_limit &&
        !( @words_before.empty? && @words_after.empty? ) )
    end

    # Appends the next word following the link to the trailing text.
    def add_from_words_after
      if @str_after == ''
        @str_after = @words_after.shift
      else
        @str_after += ' ' + @words_after.shift
      end
    end

    # Prepends the nearest word preceding the link to the leading text.
    def add_from_words_before
      if @str_before == ''
        @str_before = @words_before.pop
      else
        @str_before = @words_before.pop + ' ' + @str_before
      end
    end

    # Compares the current excerpt size with the limit; the + 2 accounts for
    # the two spaces that join the three parts.
    def excerpt_parts_under_limit
      ( @str_before.size + @strong_text.size + @str_after.size + 2 ) <
        @excerpt_limit
    end

    # Returns the clipped excerpt.
    #
    # Because the leading (.*) is greedy, the excerpt is built around the last
    # <strong> element in the text. When the text has no <strong> element
    # (the page has no link to the referred URI) the excerpt is taken from
    # the start of the text instead; this case used to raise NoMethodError.
    def execute
      if @excerpt =~ %r{(.*)(<strong>.*?</strong>\S*)(.*)}m
        @text_before = $1
        @strong_text = $2
        @text_after = $3
        get_elts.join( ' ' )
      else
        @text_before = ''
        @strong_text = ''
        @text_after = @excerpt.to_s
        # Drop the empty leading parts so the result has no stray spaces.
        get_elts.reject { |elt| elt.empty? }.join( ' ' )
      end
    end

    # Grows the excerpt and returns its parts in order: an optional leading
    # ellipsis, the text before the link, the link, the text after it, and an
    # optional trailing ellipsis. An ellipsis marks a side that was cut.
    def get_elts
      @words_before = ArrayOfWordsAndBrTags.new( @text_before )
      @words_after = ArrayOfWordsAndBrTags.new( @text_after )
      @str_before = ''
      @str_after = ''
      add_next_word while add_next_word?
      # NOTE(review): this literal is added before HtmlClipping#convert runs,
      # so a non-UTF-8 source charset may mangle it -- confirm intended form.
      ellipses = '…'
      elts = []
      elts << ellipses unless @words_before.empty?
      elts << @str_before
      elts << @strong_text
      elts << @str_after
      elts << ellipses unless @words_after.empty?
      elts
    end
  end

  # Reduces HTML to plain text in which block boundaries become '<br />' and
  # links to the referred URI become <strong> elements.
  class Detokenizer # :nodoc:
    def initialize( contents, referred_uri )
      @contents, @referred_uri = contents, referred_uri
    end

    # Collapses runs of consecutive '<br />' markers into a single one.
    # gsub! returns nil once nothing is left to replace, which ends the loop.
    def compact_brs
      result = 'something'
      until result.nil?
        result = @detokenized.gsub!( %r{<br />\s*<br />}, '<br /> ' )
      end
    end

    # Runs the whole detokenizing pipeline and returns the resulting text.
    def execute
      @detokenized = HtmlWithFixedAttributes.new( @contents )
      # Comments are removed first so that markup inside them is ignored.
      @detokenized.gsub!( /<!--.*?-->/m, '' )
      # Block-level tags turn into line-break markers ...
      @detokenized.gsub!(
        %r{</?(h\d|p|blockquote|table|tr|br|div|form|ul|li|center|ol|dl|dd|dt|fieldset|option|select|object|o:p).*?>}i,
        ' <br /> '
      )
      # ... while inline tags are dropped without a trace.
      @detokenized.gsub!(
        %r{</?(acronym|abbr|strong|td|tt|small|em|img|font|span|input|hr|noscript|legend|address).*?>}im, ''
      )
      # <b> and <i> are matched strictly so that '<br />' markers survive.
      @detokenized.gsub!( %r{</?(b|i)(\s+.*?)?>}i, '' )
      substitute_links
      compact_brs
      @detokenized.gsub!( /\s+/, ' ' )
      @detokenized
    end

    # Replaces every <a> element by its body, wrapped in <strong> when its
    # href is exactly the referred URI.
    def substitute_links
      @detokenized.gsub!( %r{<a\s+(.*?)>(.*?)</(xhtml:)?a.*?>}mi ) { |s|
        # Copy the captures first: the href match below overwrites $1 and $2.
        a_att = $1
        a_body = $2
        if ( a_att =~ /href=('|")?([^'" ]*)("|')?/i )
          href = $2
          href.gsub!( /("|')$/, '' )
          matches_self = ( href == @referred_uri )
        else
          matches_self = false
        end
        matches_self ? "<strong>#{ a_body }</strong>" : a_body
      }
    end
  end

  # A String holding HTML with <script> elements blanked out and with '>'
  # characters inside quoted attribute values escaped as '&gt;', so that
  # later tag-matching regular expressions do not end a tag too early.
  class HtmlWithFixedAttributes < String # :nodoc:
    def initialize( contents )
      super( '' )
      script_stack = []
      contents.split( %r{(</?script[^>]*>)}i ).each { |script_or_between|
        if script_or_between =~ /^<script/i
          script_stack.push( script_or_between )
        elsif script_or_between =~ %r{</script}i
          script_stack.pop
        elsif script_stack.empty?
          self << fix_attributes( script_or_between )
        else
          # Text inside a <script> element is replaced by a single space.
          self << ' '
        end
      }
    end

    # Cuts +html_part+ into pieces that each run from one '<' up to the next
    # and repairs every piece that contains a quote character.
    def fix_attributes( html_part )
      fixed = ''
      html_part.split( /(<.*?)(?=<)/ ).each { |part|
        part = fix_bracket_plus( part ) if part != '' && part =~ /('|")/
        fixed << part
      }
      fixed
    end

    # Returns +part+ with every '>' found inside a quoted attribute value
    # rewritten as '&gt;'.
    def fix_bracket_plus( part )
      inside_attribute = false
      between_tags = true # true while scanning text outside any tag
      last_attribute_quote = nil
      fixed = ''
      part.scan( /([^'"<>]*)("|'|<|>)/ ) { |pre_delimiter, delimiter|
        # The original substitution replaced '>' with itself and so had no
        # effect. Escaping it also keeps it from toggling between_tags below.
        delimiter = '&gt;' if inside_attribute && delimiter == '>'
        if %w( ' " ).include?( delimiter ) && !between_tags
          # Only the quote character that opened a value can close it.
          if last_attribute_quote == delimiter || !inside_attribute
            inside_attribute = !inside_attribute
            last_attribute_quote = delimiter
          end
        end
        between_tags = !between_tags if %w( < > ).include? delimiter
        fixed << pre_delimiter
        fixed << delimiter
      }
      # scan stops at the last delimiter; append whatever text follows it.
      post_scan = ( part =~ /.*('|"|<|>)(.*)/m ) ? $2 : part
      fixed << post_scan
      fixed
    end
  end
end
|
@@ -0,0 +1,208 @@
|
|
1
|
+
require 'iconv'
|
2
|
+
|
3
|
+
class HtmlClipping
  Version = '0.1.0'

  # Prepares a clipping generator for one referring page.
  #
  # html:: The HTML of the referring web page.
  # referred_uri:: The URI that is being referred to.
  # excerpt_limit:: The size at which the clipping stops growing. The limit is
  #                 checked before each word is added, so the final word may
  #                 push the result slightly past this value.
  def initialize( html, referred_uri, excerpt_limit )
    @contents = html
    # When the page declares its charset in a <meta> tag, set up a converter
    # from that charset to UTF-8; otherwise text is passed through untouched.
    # NOTE(review): Iconv left the standard library in Ruby 2.0; on newer
    # Rubies this branch needs the iconv gem -- confirm the target runtime.
    if @contents =~ %r{<meta.*text/html; charset=(.*?)('|")}i
      @converter = Iconv.new( 'UTF-8', $1 )
    end
    @referred_uri = referred_uri
    @excerpt_limit = excerpt_limit
  end

  # Converts +str+ to UTF-8 when a charset was detected in the page, and
  # returns it unchanged otherwise.
  def convert( str )
    @converter ? @converter.iconv( str ) : str
  end

  # Returns the clipping as a string of text with <strong> and <br /> markup.
  #
  # The text inside <body> is used when a <body> tag is present (with or
  # without a closing tag). Input that has no <body> tag at all is treated as
  # an HTML fragment and processed whole; it used to raise NoMethodError.
  def to_s
    to_detokenize =
      if @contents =~ %r{<body[^>]*>(.*)</body>}mi
        $1
      elsif @contents =~ %r{<body[^>]*>(.*)}mi
        $1
      else
        @contents
      end
    excerpt = Detokenizer.new( to_detokenize, @referred_uri ).execute
    excerpt = Clipper.new( excerpt, @excerpt_limit ).execute
    # A former gsub over high bytes was dropped here: it discarded its result,
    # so it never changed the excerpt.
    convert( excerpt )
  end

  # An Array holding the whitespace-separated words of a text, where every
  # '<br />' marker is kept as a single element of its own.
  class ArrayOfWordsAndBrTags < Array
    def initialize( text )
      super()
      # Splitting on a captured group keeps the '<br />' separators in the
      # result, so they can be told apart from the text between them.
      text.split( %r{(<br />)} ).each { |br_or_between_br|
        if br_or_between_br == '<br />'
          self << br_or_between_br
        else
          self.concat( br_or_between_br.split )
        end
      }
    end
  end

  # Trims detokenized text to an excerpt centred on the <strong> link text,
  # growing word by word on both sides until the limit is reached.
  class Clipper
    def initialize( excerpt, excerpt_limit )
      @excerpt = excerpt
      @excerpt_limit = excerpt_limit
    end

    # Adds one word to whichever side is currently shorter (the side before
    # the link wins ties), or to the only side that still has words left.
    def add_next_word
      if @words_before.empty?
        add_from_words_after
      elsif @words_after.empty?
        add_from_words_before
      elsif @str_after.size >= @str_before.size
        add_from_words_before
      else
        add_from_words_after
      end
    end

    # True while the excerpt is under the limit and words remain on at least
    # one side.
    def add_next_word?
      ( excerpt_parts_under_limit &&
        !( @words_before.empty? && @words_after.empty? ) )
    end

    # Appends the next word following the link to the trailing text.
    def add_from_words_after
      if @str_after == ''
        @str_after = @words_after.shift
      else
        @str_after += ' ' + @words_after.shift
      end
    end

    # Prepends the nearest word preceding the link to the leading text.
    def add_from_words_before
      if @str_before == ''
        @str_before = @words_before.pop
      else
        @str_before = @words_before.pop + ' ' + @str_before
      end
    end

    # Compares the current excerpt size with the limit; the + 2 accounts for
    # the two spaces that join the three parts.
    def excerpt_parts_under_limit
      ( @str_before.size + @strong_text.size + @str_after.size + 2 ) <
        @excerpt_limit
    end

    # Returns the clipped excerpt.
    #
    # Because the leading (.*) is greedy, the excerpt is built around the last
    # <strong> element in the text. When the text has no <strong> element
    # (the page has no link to the referred URI) the excerpt is taken from
    # the start of the text instead; this case used to raise NoMethodError.
    def execute
      if @excerpt =~ %r{(.*)(<strong>.*?</strong>\S*)(.*)}m
        @text_before = $1
        @strong_text = $2
        @text_after = $3
        get_elts.join( ' ' )
      else
        @text_before = ''
        @strong_text = ''
        @text_after = @excerpt.to_s
        # Drop the empty leading parts so the result has no stray spaces.
        get_elts.reject { |elt| elt.empty? }.join( ' ' )
      end
    end

    # Grows the excerpt and returns its parts in order: an optional leading
    # ellipsis, the text before the link, the link, the text after it, and an
    # optional trailing ellipsis. An ellipsis marks a side that was cut.
    def get_elts
      @words_before = ArrayOfWordsAndBrTags.new( @text_before )
      @words_after = ArrayOfWordsAndBrTags.new( @text_after )
      @str_before = ''
      @str_after = ''
      add_next_word while add_next_word?
      # NOTE(review): this literal is added before HtmlClipping#convert runs,
      # so a non-UTF-8 source charset may mangle it -- confirm intended form.
      ellipses = '…'
      elts = []
      elts << ellipses unless @words_before.empty?
      elts << @str_before
      elts << @strong_text
      elts << @str_after
      elts << ellipses unless @words_after.empty?
      elts
    end
  end

  # Reduces HTML to plain text in which block boundaries become '<br />' and
  # links to the referred URI become <strong> elements.
  class Detokenizer
    def initialize( contents, referred_uri )
      @contents, @referred_uri = contents, referred_uri
    end

    # Collapses runs of consecutive '<br />' markers into a single one.
    # gsub! returns nil once nothing is left to replace, which ends the loop.
    def compact_brs
      result = 'something'
      until result.nil?
        result = @detokenized.gsub!( %r{<br />\s*<br />}, '<br /> ' )
      end
    end

    # Runs the whole detokenizing pipeline and returns the resulting text.
    def execute
      @detokenized = HtmlWithFixedAttributes.new( @contents )
      # Comments are removed first so that markup inside them is ignored.
      @detokenized.gsub!( /<!--.*?-->/m, '' )
      # Block-level tags turn into line-break markers ...
      @detokenized.gsub!(
        %r{</?(h\d|p|blockquote|table|tr|br|div|form|ul|li|center|ol|dl|dd|dt|fieldset|option|select|object|o:p).*?>}i,
        ' <br /> '
      )
      # ... while inline tags are dropped without a trace.
      @detokenized.gsub!(
        %r{</?(acronym|abbr|strong|td|tt|small|em|img|font|span|input|hr|noscript|legend|address).*?>}im, ''
      )
      # <b> and <i> are matched strictly so that '<br />' markers survive.
      @detokenized.gsub!( %r{</?(b|i)(\s+.*?)?>}i, '' )
      substitute_links
      compact_brs
      @detokenized.gsub!( /\s+/, ' ' )
      @detokenized
    end

    # Replaces every <a> element by its body, wrapped in <strong> when its
    # href is exactly the referred URI.
    def substitute_links
      @detokenized.gsub!( %r{<a\s+(.*?)>(.*?)</(xhtml:)?a.*?>}mi ) { |s|
        # Copy the captures first: the href match below overwrites $1 and $2.
        a_att = $1
        a_body = $2
        if ( a_att =~ /href=('|")?([^'" ]*)("|')?/i )
          href = $2
          href.gsub!( /("|')$/, '' )
          matches_self = ( href == @referred_uri )
        else
          matches_self = false
        end
        matches_self ? "<strong>#{ a_body }</strong>" : a_body
      }
    end
  end

  # A String holding HTML with <script> elements blanked out and with '>'
  # characters inside quoted attribute values escaped as '&gt;', so that
  # later tag-matching regular expressions do not end a tag too early.
  class HtmlWithFixedAttributes < String
    def initialize( contents )
      super( '' )
      script_stack = []
      contents.split( %r{(</?script[^>]*>)}i ).each { |script_or_between|
        if script_or_between =~ /^<script/i
          script_stack.push( script_or_between )
        elsif script_or_between =~ %r{</script}i
          script_stack.pop
        elsif script_stack.empty?
          self << fix_attributes( script_or_between )
        else
          # Text inside a <script> element is replaced by a single space.
          self << ' '
        end
      }
    end

    # Cuts +html_part+ into pieces that each run from one '<' up to the next
    # and repairs every piece that contains a quote character.
    def fix_attributes( html_part )
      fixed = ''
      html_part.split( /(<.*?)(?=<)/ ).each { |part|
        part = fix_bracket_plus( part ) if part != '' && part =~ /('|")/
        fixed << part
      }
      fixed
    end

    # Returns +part+ with every '>' found inside a quoted attribute value
    # rewritten as '&gt;'.
    def fix_bracket_plus( part )
      inside_attribute = false
      between_tags = true # true while scanning text outside any tag
      last_attribute_quote = nil
      fixed = ''
      part.scan( /([^'"<>]*)("|'|<|>)/ ) { |pre_delimiter, delimiter|
        # The original substitution replaced '>' with itself and so had no
        # effect. Escaping it also keeps it from toggling between_tags below.
        delimiter = '&gt;' if inside_attribute && delimiter == '>'
        if %w( ' " ).include?( delimiter ) && !between_tags
          # Only the quote character that opened a value can close it.
          if last_attribute_quote == delimiter || !inside_attribute
            inside_attribute = !inside_attribute
            last_attribute_quote = delimiter
          end
        end
        between_tags = !between_tags if %w( < > ).include? delimiter
        fixed << pre_delimiter
        fixed << delimiter
      }
      # scan stops at the last delimiter; append whatever text follows it.
      post_scan = ( part =~ /.*('|"|<|>)(.*)/m ) ? $2 : part
      fixed << post_scan
      fixed
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.6
|
3
|
+
specification_version: 1
|
4
|
+
name: htmlclipping
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.1.0
|
7
|
+
date: 2005-05-15
|
8
|
+
summary: HtmlClipping generates excerpts from an HTML page that has a link pointing to a particular URI.
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: sera@fhwang.net
|
12
|
+
homepage: http://htmlclipping.rubyforge.org/
|
13
|
+
rubyforge_project:
|
14
|
+
description: "HtmlClipping generates excerpts from an HTML page that has a link pointing to a
|
15
|
+
particular URI. It removes most HTML markup, bolds the link text, and trims the
|
16
|
+
resulting text to a fixed number of characters. I developed it to help me track
|
17
|
+
referers to my website, though I suppose it might have other uses."
|
18
|
+
autorequire: htmlclipping
|
19
|
+
default_executable:
|
20
|
+
bindir: bin
|
21
|
+
has_rdoc: false
|
22
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
23
|
+
requirements:
|
24
|
+
-
|
25
|
+
- ">"
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: 0.0.0
|
28
|
+
version:
|
29
|
+
platform: ruby
|
30
|
+
authors:
|
31
|
+
- Francis Hwang
|
32
|
+
files:
|
33
|
+
- lib/htmlclipping.rb
|
34
|
+
- lib/htmlclipping.rb~
|
35
|
+
test_files: []
|
36
|
+
rdoc_options: []
|
37
|
+
extra_rdoc_files: []
|
38
|
+
executables: []
|
39
|
+
extensions: []
|
40
|
+
requirements: []
|
41
|
+
dependencies: []
|