autolinker 0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,112 @@
require 'set'
require 'cgi'

module Autolinker
  module HTML
    # Allow-list based HTML sanitizer. The input is split into tokens, tags and
    # attributes that are not explicitly allowed are dropped, URI-carrying
    # attributes are checked against a list of allowed protocols, and free "<"
    # characters in text are escaped.
    #
    # All allow-lists are plain Sets exposed through accessors, so they can be
    # adjusted per instance after construction.
    class Sanitizer
      attr_accessor :protocol_separator, :uri_attributes, :allowed_attributes, :allowed_tags, :allowed_protocols, :bad_tags,
                    :allowed_css_properties, :allowed_css_keywords, :shorthand_css_properties

      def initialize
        # A regular expression of the valid characters used to separate protocols like
        # the ':' in 'http://foo.com'. Covers the literal colon plus its numeric-entity
        # and percent-encoded spellings ("%3A" and "&#37;3A").
        # NOTE(review): "&#x70" is the entity for "p", not ":"; it is kept because
        # #contains_bad_protocols? uses the same alternative - confirm it is intended.
        @protocol_separator = /:|(&#0*58)|(&#x70)|(&#x0*3a)|(%|&#37;)3A/i

        # Specifies a Set of HTML attributes that can have URIs.
        @uri_attributes = Set.new(%w(href src cite action longdesc xlink:href lowsrc))

        # Specifies a Set of 'bad' tags that the #sanitize helper will remove completely, as opposed
        # to just escaping harmless tags like <font>
        @bad_tags = Set.new(%w(script))

        # Specifies the default Set of tags that the #sanitize helper will allow unscathed.
        @allowed_tags = Set.new(%w(strong em b i p code pre tt samp kbd var sub
          sup dfn cite big small address hr br div span h1 h2 h3 h4 h5 h6 ul ol li dl dt dd abbr
          acronym a img blockquote del ins))

        # Specifies the default Set of html attributes that the #sanitize helper will leave
        # in the allowed tag.
        @allowed_attributes = Set.new(%w(href src width height alt cite datetime title class name xml:lang abbr))

        # Specifies the default Set of protocols that are accepted in the value of a
        # URI attribute (see +uri_attributes+).
        @allowed_protocols = Set.new(%w(ed2k ftp http https irc mailto news gopher nntp telnet webcal xmpp callto
          feed svn urn aim rsync tag ssh sftp rtsp afs))

        # Specifies the default Set of acceptable css properties that #sanitize_css will accept.
        @allowed_css_properties = Set.new(%w(azimuth background-color border-bottom-color border-collapse
          border-color border-left-color border-right-color border-top-color clear color cursor direction display
          elevation float font font-family font-size font-style font-variant font-weight height letter-spacing line-height
          overflow pause pause-after pause-before pitch pitch-range richness speak speak-header speak-numeral speak-punctuation
          speech-rate stress text-align text-decoration text-indent unicode-bidi vertical-align voice-family volume white-space
          width))

        # Specifies the default Set of acceptable css keywords that #sanitize_css will accept
        # as values of shorthand properties.
        @allowed_css_keywords = Set.new(%w(auto aqua black block blue bold both bottom brown center
          collapse dashed dotted fuchsia gray green !important italic left lime maroon medium none navy normal
          nowrap olive pointer purple red right solid silver teal top transparent underline white yellow))

        # Specifies the default Set of allowed shorthand css properties for #sanitize_css.
        @shorthand_css_properties = Set.new(%w(background border margin padding))
      end

      # Returns +text+ with disallowed markup removed. Input that is nil, empty or
      # contains no "<" is returned untouched.
      #
      # +options+ may carry <tt>:tags</tt> and <tt>:attributes</tt> to override the
      # instance allow-lists for this call.
      def sanitize(text, options = {})
        return text unless sanitizeable?(text)
        tokenize(text, options).join
      end

      # True when +text+ is a non-empty string containing at least one "<".
      def sanitizeable?(text)
        !(text.nil? || text.empty? || !text.index("<"))
      end

      # Reduces a CSS declaration list (the value of a +style+ attribute) to the
      # declarations permitted by +allowed_css_properties+, plus shorthand
      # properties (+shorthand_css_properties+) whose every value token is either
      # an allowed keyword or a simple colour/length literal.
      #
      # Returns the cleaned declarations joined by a space, or '' when the input
      # contains anything outside a conservative character set.
      def sanitize_css(style)
        # url(...) references are never allowed
        style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

        # Reject the whole value unless it consists only of harmless characters and
        # has the overall shape "property: value; property: value".
        harmless   = /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*\z/
        well_formed = /\A(\s*[-\w]+\s*:\s*[^:;]*(;|\z)\s*)*\z/
        return '' unless style =~ harmless && style =~ well_formed

        literal = /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/

        clean = []
        style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
          if allowed_css_properties.include?(prop.downcase)
            clean << "#{prop}: #{val};"
          elsif shorthand_css_properties.include?(prop.split('-')[0].downcase)
            acceptable = val.split.all? do |keyword|
              allowed_css_keywords.include?(keyword) || keyword =~ literal
            end
            clean << "#{prop}: #{val};" if acceptable
          end
        end
        clean.join(' ')
      end

      protected

      # Splits +text+ into tokens, runs each through #process_node and returns the
      # resulting Array (entries may be nodes, strings or nil).
      def tokenize(text, options)
        # Work on a copy so the caller's Hash does not pick up :parent and the defaults.
        options = options.dup
        options[:parent] = []
        options[:attributes] ||= allowed_attributes
        options[:tags] ||= allowed_tags

        tokenizer = HTML::Tokenizer.new(text)
        result = []
        while (token = tokenizer.next)
          # NOTE(review): Node is not defined in this file; it is resolved from the
          # enclosing Autolinker::HTML namespace at call time.
          node = Node.parse(nil, 0, 0, token, false)
          process_node node, result, options
        end
        result
      end

      # Appends the sanitized form of +node+ to +result+: an allowed tag (with its
      # attributes cleaned), nil for a disallowed tag or for text inside a bad tag,
      # or the node's text with "<" escaped.
      def process_node(node, result, options)
        result << case node
          when HTML::Tag
            # options[:parent] is a stack of currently open tag names, innermost first.
            if node.closing == :close
              options[:parent].shift
            else
              options[:parent].unshift node.name
            end

            process_attributes_for node, options

            options[:tags].include?(node.name) ? node : nil
          else
            bad_tags.include?(options[:parent].first) ? nil : node.to_s.gsub(/</, "&lt;")
        end
      end

      # Deletes attributes that are not allowed or that carry a disallowed protocol;
      # remaining values are normalised to HTML-escaped text (+style+ goes through
      # #sanitize_css instead).
      def process_attributes_for(node, options)
        return unless node.attributes
        node.attributes.keys.each do |attr_name|
          value = node.attributes[attr_name].to_s

          if !options[:attributes].include?(attr_name) || contains_bad_protocols?(attr_name, value)
            node.attributes.delete(attr_name)
          else
            node.attributes[attr_name] = attr_name == 'style' ? sanitize_css(value) : CGI::escapeHTML(CGI::unescapeHTML(value))
          end
        end
      end

      # Truthy when +attr_name+ is a URI attribute and +value+ appears to start with
      # a protocol that is not in +allowed_protocols+.
      def contains_bad_protocols?(attr_name, value)
        uri_attributes.include?(attr_name) &&
          # +to_s+ guards against values such as ":" where split yields no elements.
          (value =~ /(^[^\/:]*):|(&#0*58)|(&#x70)|(&#x0*3a)|(%|&#37;)3A/i && !allowed_protocols.include?(value.split(protocol_separator).first.to_s.downcase.strip))
      end
    end
  end
end
@@ -0,0 +1,104 @@
1
require 'strscan'

module Autolinker
  module HTML
    # A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each
    # token is a string. Each string represents either "text", or an HTML element.
    #
    # This currently assumes valid XHTML, which means no free < or > characters.
    #
    # Usage:
    #
    #   tokenizer = HTML::Tokenizer.new(text)
    #   while token = tokenizer.next
    #     p token
    #   end
    class Tokenizer #:nodoc:

      # The (byte) position in the text at which the most recent token started
      attr_reader :position

      # The line number on which the most recent token started
      attr_reader :line

      # Create a new Tokenizer for the given text.
      def initialize(text)
        @scanner = StringScanner.new(text)
        @position = 0
        @line = 0
        @current_line = 1
      end

      # Return the next token in the sequence, or +nil+ if there are no more tokens in
      # the stream.
      def next
        return nil if @scanner.eos?
        @position = @scanner.pos
        @line = @current_line
        # "<" followed by a non-space character starts a tag; anything else is text.
        token = @scanner.check(/<\S/) ? scan_tag : scan_text
        update_current_line(token)
      end

      private

      # Treat the text at the current position as a tag, and scan it. Supports
      # comments, CDATA sections, doctype tags, and regular tags, and ignores
      # less-than and greater-than characters within quoted strings.
      def scan_tag
        tag = @scanner.getch
        if @scanner.scan(/!--/) # comment
          tag << @scanner.matched
          # An unterminated comment swallows the remainder of the input.
          tag << (@scanner.scan_until(/--\s*>/) || @scanner.scan_until(/\Z/))
        elsif @scanner.scan(/!\[CDATA\[/)
          tag << @scanner.matched
          tag << (@scanner.scan_until(/\]\]>/) || @scanner.scan_until(/\Z/))
        elsif @scanner.scan(/!/) # doctype
          tag << @scanner.matched
          tag << consume_quoted_regions
        else
          tag << consume_quoted_regions
        end
        tag
      end

      # Scan all text up to the next < character and return it. The first character
      # is always consumed, so a "<" that does not start a tag is treated as text.
      def scan_text
        "#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"
      end

      # Counts the number of newlines in the text and updates the current line
      # accordingly. Returns +text+ so it can be used as the value of #next.
      def update_current_line(text)
        @current_line += text.count("\n")
        text
      end

      # Consumes the rest of a tag up to and including its closing ">", skipping
      # over quoted strings so that less-than and greater-than characters within
      # the strings are ignored. Stops early (without consuming it) at a "<".
      def consume_quoted_regions
        text = ''.dup
        loop do
          match = @scanner.scan_until(/['"<>]/) or break

          delim = @scanner.matched
          if delim == "<"
            # Give the "<" back: it belongs to the next token.
            match = match.chop
            @scanner.pos -= 1
          end

          text << match
          break if delim == "<" || delim == ">"

          # consume the quoted region
          while (match = @scanner.scan_until(/[\\#{delim}]/))
            text << match
            break if @scanner.matched == delim
            break if @scanner.eos?
            text << @scanner.getch # skip the escaped character
          end
        end
        text
      end
    end
  end
end
@@ -0,0 +1,292 @@
1
require 'erb'
require 'set'

module Autolinker
  class TextHelper
    # Turns all URLs and e-mail addresses into clickable links. The <tt>:link</tt> option
    # will limit what should be linked. You can add HTML attributes to the links using
    # <tt>:html</tt>. Possible values for <tt>:link</tt> are <tt>:all</tt> (default),
    # <tt>:email_addresses</tt>, and <tt>:urls</tt>. If a block is given, each URL and
    # e-mail address is yielded and the result is used as the link text. By default the
    # text given is sanitized, you can override this behaviour setting the
    # <tt>:sanitize</tt> option to false, or you can add options to the sanitization of
    # the text using the <tt>:sanitize_options</tt> option hash.
    #
    # Returns an empty String for nil or empty +text+.
    #
    # ==== Examples
    #   auto_link("Go to http://www.rubyonrails.org and say hello to david@loudthinking.com")
    #   # => "Go to <a href=\"http://www.rubyonrails.org\">http://www.rubyonrails.org</a> and
    #   #     say hello to <a href=\"mailto:david@loudthinking.com\">david@loudthinking.com</a>"
    #
    #   auto_link("Visit http://www.loudthinking.com/ or e-mail david@loudthinking.com", :link => :urls)
    #   # => "Visit <a href=\"http://www.loudthinking.com/\">http://www.loudthinking.com/</a>
    #   #     or e-mail david@loudthinking.com"
    #
    #   auto_link("Visit http://www.loudthinking.com/ or e-mail david@loudthinking.com", :link => :email_addresses)
    #   # => "Visit http://www.loudthinking.com/ or e-mail <a href=\"mailto:david@loudthinking.com\">david@loudthinking.com</a>"
    #
    #   post_body = "Welcome to my new blog at http://www.myblog.com/.  Please e-mail me at me@email.com."
    #   auto_link(post_body, :html => { :target => '_blank' }) do |text|
    #     truncate(text, :length => 15)
    #   end
    #   # => "Welcome to my new blog at <a href=\"http://www.myblog.com/\" target=\"_blank\">http://www.m...</a>.
    #         Please e-mail me at <a href=\"mailto:me@email.com\">me@email.com</a>."
    #
    #
    # You can still use <tt>auto_link</tt> with the old API that accepts the
    # +link+ as its optional second parameter and the +html_options+ hash
    # as its optional third parameter:
    #   post_body = "Welcome to my new blog at http://www.myblog.com/. Please e-mail me at me@email.com."
    #   auto_link(post_body, :urls)
    #   # => "Welcome to my new blog at <a href=\"http://www.myblog.com/\">http://www.myblog.com</a>.
    #         Please e-mail me at me@email.com."
    #
    #   auto_link(post_body, :all, :target => "_blank")
    #   # => "Welcome to my new blog at <a href=\"http://www.myblog.com/\" target=\"_blank\">http://www.myblog.com</a>.
    #         Please e-mail me at <a href=\"mailto:me@email.com\">me@email.com</a>."
    def auto_link(text, *args, &block) # link = :all, html = {}, &block
      return '' if text.nil? || text.empty?

      # With exactly two extra arguments the trailing Hash is the old API's
      # html_options, not an options Hash, so it must not be extracted.
      options = args.size == 2 ? {} : extract_options!(args)

      unless args.empty?
        options[:link] = args[0] || :all
        options[:html] = args[1] || {}
      end
      options = { :link => :all, :html => {} }.merge(options)

      sanitize_options = options[:sanitize_options] || {}
      sanitize = (options[:sanitize] != false)
      text = conditional_sanitize(text, sanitize, sanitize_options).to_str

      # +options+ is forwarded to both linkers so that :sanitize => false is
      # honoured for e-mail addresses as well as for URLs.
      case options[:link].to_sym
      when :all
        with_urls = auto_link_urls(text, options[:html], options, &block)
        auto_link_email_addresses(with_urls, options[:html], options, &block)
      when :email_addresses
        auto_link_email_addresses(text, options[:html], options, &block)
      when :urls
        auto_link_urls(text, options[:html], options, &block)
      end
    end

    private

    AUTO_LINK_RE = %r{
        (?: ((?:ed2k|ftp|http|https|irc|mailto|news|gopher|nntp|telnet|webcal|xmpp|callto|feed|svn|urn|aim|rsync|tag|ssh|sftp|rtsp|afs|file):)// | www\. )
        [^\s<\u00A0]+
      }x

    # regexps for determining context, used high-volume
    AUTO_LINK_CRE = [/<[^>]+$/, /^[^>]*>/, /<a\b.*?>/i, /<\/a>/i]

    AUTO_EMAIL_LOCAL_RE = /[\w.!#\$%&'*\/=?^`{|}~+-]/
    AUTO_EMAIL_RE = /[\w.!#\$%+-]\.?(?:#{AUTO_EMAIL_LOCAL_RE}+\.)*#{AUTO_EMAIL_LOCAL_RE}*@[\w-]+(?:\.[\w-]+)+/

    # Closing bracket => matching opening bracket
    BRACKETS = { ']' => '[', ')' => '(', '}' => '{' }

    WORD_PATTERN = RUBY_VERSION < '1.9' ? '\w' : '\p{Word}'

    # A single trailing character that is not a word character, "/" or "-"
    TRAILING_PUNCTUATION_RE = /[^#{WORD_PATTERN}\/-]$/

    # Turns all urls into clickable links.  If a block is given, each url
    # is yielded and the result is used as the link text.
    def auto_link_urls(text, link_attributes = {}, options = {})
      text.gsub(AUTO_LINK_RE) do
        # Capture the match data first: later regexp operations overwrite it.
        found = Regexp.last_match
        scheme, href = found[1], found[0]
        punctuation = []

        if auto_linked?(found.pre_match, found.post_match)
          # do not change string; URL is already linked
          href
        else
          # don't include trailing punctuation character as part of the URL
          while href.sub!(TRAILING_PUNCTUATION_RE, '')
            punctuation.push Regexp.last_match(0)
            # A closing bracket stays part of the URL when the URL still contains
            # an unbalanced opening bracket of the same kind.
            opening = BRACKETS[punctuation.last]
            if opening && href.scan(opening).size > href.scan(punctuation.last).size
              href << punctuation.pop
              break
            end
          end

          link_text = block_given? ? yield(href) : href
          href = 'http://' + href unless scheme

          unless options[:sanitize] == false
            link_text = sanitize(link_text)
            href = sanitize(href)
          end
          content_tag(:a, link_text, link_attributes.merge('href' => href), !!options[:sanitize]) + punctuation.reverse.join('')
        end
      end
    end

    # Turns all email addresses into clickable links.  If a block is given,
    # each email is yielded and the result is used as the link text.
    def auto_link_email_addresses(text, html_options = {}, options = {})
      text.gsub(AUTO_EMAIL_RE) do
        found = Regexp.last_match
        address = found[0]

        if auto_linked?(found.pre_match, found.post_match)
          address
        else
          display_text = block_given? ? yield(address) : address

          unless options[:sanitize] == false
            address = sanitize(address)
            display_text = sanitize(display_text) unless address == display_text
          end
          mail_to address, display_text, html_options
        end
      end
    end

    # Removes and returns a trailing Hash from +args+, or returns a new empty
    # Hash when the last argument is not a Hash.
    def extract_options!(args)
      args.last.is_a?(Hash) ? args.pop : {}
    end

    # Detects already linked context or position in the middle of a tag
    def auto_linked?(left, right)
      # Inside a tag: an unclosed "<" to the left and a ">" to the right.
      (left =~ AUTO_LINK_CRE[0] && right =~ AUTO_LINK_CRE[1]) ||
        # Inside an anchor: the last "<a ...>" to the left has no "</a>" after it.
        (left.rindex(AUTO_LINK_CRE[2]) && Regexp.last_match.post_match !~ AUTO_LINK_CRE[3])
    end

    # Sanitizes +target+ only when +condition+ is truthy.
    def conditional_sanitize(target, condition, sanitize_options = {})
      condition ? sanitize(target, sanitize_options) : target
    end

    # Runs +html+ through a fresh Autolinker::HTML::Sanitizer.
    def sanitize(html, options = {})
      Autolinker::HTML::Sanitizer.new.sanitize(html, options)
    end

    # Builds an HTML element +name+ with the given content and attributes.
    # When +escape+ is true both content and attribute values are HTML-escaped.
    # NOTE(review): the block form calls +capture+, which is not defined in this
    # class; it is presumably supplied by a host framework - confirm before use.
    def content_tag(name, content_or_options_with_block = nil, options = nil, escape = true, &block)
      if block_given?
        options = content_or_options_with_block if content_or_options_with_block.is_a?(Hash)
        content_tag_string(name, capture(&block), options, escape)
      else
        content_tag_string(name, content_or_options_with_block, options, escape)
      end
    end

    # Renders "<name attrs>content</name>" as a String.
    def content_tag_string(name, content, options, escape = true)
      tag_options = tag_options(options, escape) if options
      "<#{name}#{tag_options}>#{PRE_CONTENT_STRINGS[name.to_sym]}#{escape ? ERB::Util.h(content) : content}</#{name}>"
    end

    # Attributes rendered as name="name" when truthy and omitted otherwise.
    # Both the String and the Symbol form of every name are included.
    boolean_attribute_names = %w(disabled readonly multiple checked autobuffer
                                 autoplay controls loop selected hidden scoped async
                                 defer reversed ismap seamless muted required
                                 autofocus novalidate formnovalidate open pubdate)
    BOOLEAN_ATTRIBUTES = (boolean_attribute_names + boolean_attribute_names.map { |attribute| attribute.to_sym }).to_set

    # Text emitted immediately after the opening tag of the given elements.
    PRE_CONTENT_STRINGS = {
      :textarea => "\n"
    }

    # Renders an attribute Hash as a String with a leading space and the
    # attributes sorted, or returns nil when there is nothing to render.
    # NOTE(review): the "data" branch relies on +dasherize+, +to_json+ and
    # BigDecimal, none of which are defined or required in this file.
    def tag_options(options, escape = true)
      return if options.nil? || options.empty?

      attrs = []
      options.each_pair do |key, value|
        if key.to_s == 'data' && value.is_a?(Hash)
          value.each do |k, v|
            unless v.is_a?(String) || v.is_a?(Symbol) || v.is_a?(BigDecimal)
              v = v.to_json
            end
            v = ERB::Util.html_escape(v) if escape
            attrs << %(data-#{k.to_s.dasherize}="#{v}")
          end
        elsif BOOLEAN_ATTRIBUTES.include?(key)
          attrs << %(#{key}="#{key}") if value
        elsif !value.nil?
          final_value = value.is_a?(Array) ? value.join(" ") : value
          final_value = ERB::Util.html_escape(final_value) if escape
          attrs << %(#{key}="#{final_value}")
        end
      end
      " #{attrs.sort * ' '}" unless attrs.empty?
    end

    # Creates a mailto link tag to the specified +email_address+, which is
    # also used as the name of the link unless +name+ is specified. Additional
    # HTML attributes for the link can be passed in +html_options+.
    #
    # +mail_to+ has several methods for hindering email harvesters and customizing
    # the email itself by passing special keys to +html_options+. Keys may be
    # Strings or Symbols; the Hash passed in is not modified.
    #
    # ==== Options
    # * <tt>:encode</tt> - This key will accept the strings "javascript" or "hex".
    #   Passing "javascript" will dynamically create and encode the mailto link then
    #   eval it into the DOM of the page. This method will not show the link on
    #   the page if the user has JavaScript disabled. Passing "hex" will hex
    #   encode the +email_address+ before outputting the mailto link.
    # * <tt>:replace_at</tt> - When the link +name+ isn't provided, the
    #   +email_address+ is used for the link label. You can use this option to
    #   obfuscate the +email_address+ by substituting the @ sign with the string
    #   given as the value.
    # * <tt>:replace_dot</tt> - When the link +name+ isn't provided, the
    #   +email_address+ is used for the link label. You can use this option to
    #   obfuscate the +email_address+ by substituting the . in the email with the
    #   string given as the value.
    # * <tt>:subject</tt> - Preset the subject line of the email.
    # * <tt>:body</tt> - Preset the body of the email.
    # * <tt>:cc</tt> - Carbon Copy additional recipients on the email.
    # * <tt>:bcc</tt> - Blind Carbon Copy additional recipients on the email.
    #
    # ==== Examples
    #   mail_to "me@domain.com"
    #   # => <a href="mailto:me@domain.com">me@domain.com</a>
    #
    #   mail_to "me@domain.com", "My email", :encode => "javascript"
    #   # => <script type="text/javascript">eval(decodeURIComponent('%64%6f%63...%27%29%3b'))</script>
    #
    #   mail_to "me@domain.com", "My email", :encode => "hex"
    #   # => <a href="mailto:%6d%65@%64%6f%6d%61%69%6e.%63%6f%6d">My email</a>
    #
    #   mail_to "me@domain.com", nil, :replace_at => "_at_", :replace_dot => "_dot_", :class => "email"
    #   # => <a class="email" href="mailto:me@domain.com">me_at_domain_dot_com</a>
    #
    #   mail_to "me@domain.com", "My email", :cc => "ccaddress@domain.com",
    #            :subject => "This is an example email"
    #   # => <a href="mailto:me@domain.com?cc=ccaddress@domain.com&subject=This%20is%20an%20example%20email">My email</a>
    #
    # NOTE(review): Rack::Utils, Mime::JS and escape_javascript are referenced
    # below but are not defined or required in this file. Also, values escaped
    # here with ERB::Util.html_escape are escaped again by content_tag; with the
    # plain standard-library ERB that double-escapes "&" - confirm which
    # environment this is expected to run in.
    def mail_to(email_address, name = nil, html_options = {})
      # Private copy with String keys: the special keys are deleted below, and the
      # same Hash is passed in again for every address found by auto_link.
      html_options = Hash[html_options.map { |key, value| [key.to_s, value] }]
      email_address = ERB::Util.html_escape(email_address)

      encode = html_options.delete("encode").to_s

      extras = %w{ cc bcc body subject }.map { |item|
        option = html_options.delete(item) || next
        "#{item}=#{Rack::Utils.escape(option).gsub("+", "%20")}"
      }.compact
      extras = extras.empty? ? '' : '?' + ERB::Util.html_escape(extras.join('&'))

      # Obfuscate a copy so the address used for the href stays intact.
      email_address_obfuscated = email_address.to_str.dup
      email_address_obfuscated.gsub!(/@/, html_options.delete("replace_at")) if html_options.key?("replace_at")
      email_address_obfuscated.gsub!(/\./, html_options.delete("replace_dot")) if html_options.key?("replace_dot")
      case encode
      when "javascript"
        string = ''.dup
        html = content_tag("a", name || email_address_obfuscated, html_options.merge("href" => "mailto:#{email_address}#{extras}"))
        html = escape_javascript(html.to_str)
        # Percent-encode every byte of the document.write call.
        "document.write('#{html}');".each_byte do |c|
          string << sprintf("%%%x", c)
        end
        "<script type=\"#{Mime::JS}\">eval(decodeURIComponent('#{string}'))</script>"
      when "hex"
        # Label: every byte as a decimal character reference.
        email_address_encoded = email_address_obfuscated.unpack('C*').map { |c|
          sprintf("&#%d;", c)
        }.join

        # href: "mailto:" as character references, then the address with word
        # characters percent-encoded and everything else left as is.
        string = 'mailto:'.unpack('C*').map { |c|
          sprintf("&#%d;", c)
        }.join + email_address.unpack('C*').map { |c|
          char = c.chr
          char =~ /\w/ ? sprintf("%%%x", c) : char
        }.join

        content_tag "a", name || email_address_encoded, html_options.merge("href" => "#{string}#{extras}")
      else
        content_tag "a", name || email_address_obfuscated, html_options.merge("href" => "mailto:#{email_address}#{extras}")
      end
    end

  end
end