autolinker 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,112 @@
1
+ require 'set'
2
+ require 'cgi'
3
+
4
+ module Autolinker
5
+ module HTML
6
+ class Sanitizer
7
+ attr_accessor :protocol_separator, :uri_attributes, :allowed_attributes, :allowed_tags, :allowed_protocols, :bad_tags,
8
+ :allowed_css_properties, :allowed_css_keywords, :shorthand_css_properties
9
+
10
# Builds a sanitizer preloaded with the default allow/deny lists. Each list
# is stored in an instance variable that has a matching accessor, so callers
# can replace or extend any of them after construction.
def initialize
  # A regular expression of the valid characters used to separate protocols like
  # the ':' in 'http://foo.com'. Besides the literal colon it accepts the
  # entity and percent-encoded spellings; the '%' of '%3A' may itself be
  # written as the entity '&#37;' (same alternation as #contains_bad_protocols?).
  @protocol_separator = /:|(&#0*58)|(&#x70)|(&#x0*3a)|(%|&#37;)3A/i

  # Specifies a Set of HTML attributes that can have URIs.
  @uri_attributes = Set.new(%w(href src cite action longdesc xlink:href lowsrc))

  # Specifies a Set of 'bad' tags that the #sanitize helper will remove completely, as opposed
  # to just escaping harmless tags like <font>
  @bad_tags = Set.new(%w(script))

  # Specifies the default Set of tags that the #sanitize helper will allow unscathed.
  @allowed_tags = Set.new(%w(strong em b i p code pre tt samp kbd var sub
    sup dfn cite big small address hr br div span h1 h2 h3 h4 h5 h6 ul ol li dl dt dd abbr
    acronym a img blockquote del ins))

  # Specifies the default Set of html attributes that the #sanitize helper will leave
  # in the allowed tag.
  @allowed_attributes = Set.new(%w(href src width height alt cite datetime title class name xml:lang abbr))

  # Specifies the default Set of URI protocols that #sanitize will accept in URI attributes.
  @allowed_protocols = Set.new(%w(ed2k ftp http https irc mailto news gopher nntp telnet webcal xmpp callto
    feed svn urn aim rsync tag ssh sftp rtsp afs))

  # Specifies the default Set of acceptable css properties.
  @allowed_css_properties = Set.new(%w(azimuth background-color border-bottom-color border-collapse
    border-color border-left-color border-right-color border-top-color clear color cursor direction display
    elevation float font font-family font-size font-style font-variant font-weight height letter-spacing line-height
    overflow pause pause-after pause-before pitch pitch-range richness speak speak-header speak-numeral speak-punctuation
    speech-rate stress text-align text-decoration text-indent unicode-bidi vertical-align voice-family volume white-space
    width))

  # Specifies the default Set of acceptable css keywords.
  @allowed_css_keywords = Set.new(%w(auto aqua black block blue bold both bottom brown center
    collapse dashed dotted fuchsia gray green !important italic left lime maroon medium none navy normal
    nowrap olive pointer purple red right solid silver teal top transparent underline white yellow))

  # Specifies the default Set of allowed shorthand css properties.
  @shorthand_css_properties = Set.new(%w(background border margin padding))
end
51
+
52
# Sanitizes +text+ by tokenizing it and joining the processed tokens.
# Input that #sanitizeable? rejects is handed back untouched.
def sanitize(text, options = {})
  if sanitizeable?(text)
    tokenize(text, options).join
  else
    text
  end
end
56
+
57
# True only when +text+ is non-nil, non-empty and contains at least one "<".
def sanitizeable?(text)
  return false if text.nil? || text.empty?

  !!text.index("<")
end
60
+
61
+ protected
62
# Splits +text+ into tokens, runs every token through #process_node and
# returns the array of results. Note that +options+ is modified in place:
# :parent is reset, and :attributes / :tags are filled with the defaults
# when the caller did not supply them.
def tokenize(text, options)
  options[:parent]       = []
  options[:attributes] ||= allowed_attributes
  options[:tags]       ||= allowed_tags

  output = []
  token_stream = HTML::Tokenizer.new(text)
  while (raw_token = token_stream.next)
    process_node(Node.parse(nil, 0, 0, raw_token, false), output, options)
  end
  output
end
75
+
76
# Appends the sanitized form of +node+ to +result+ and returns +result+.
#
# Tag nodes update the stack of open tag names kept in options[:parent],
# get their attributes filtered, and are kept only when their name is in
# options[:tags] (otherwise nil is appended). Any other node is appended as
# text with "<" escaped, unless the innermost open tag is one of #bad_tags,
# in which case nil is appended.
def process_node(node, result, options)
  open_tags = options[:parent]

  rendered =
    if HTML::Tag === node
      if node.closing == :close
        open_tags.shift
      else
        open_tags.unshift(node.name)
      end

      process_attributes_for(node, options)

      node if options[:tags].include?(node.name)
    elsif !bad_tags.include?(open_tags.first)
      node.to_s.gsub(/</, "&lt;")
    end

  result << rendered
end
92
+
93
# Filters the attributes of +node+ in place. An attribute is removed when its
# name is not in options[:attributes] or its value fails
# #contains_bad_protocols?. Surviving values are re-escaped ('style' values go
# through sanitize_css instead). Does nothing when the node has no attributes.
def process_attributes_for(node, options)
  attrs = node.attributes
  return unless attrs

  attrs.keys.each do |key|
    raw = attrs[key].to_s
    keep = options[:attributes].include?(key) && !contains_bad_protocols?(key, raw)

    unless keep
      attrs.delete(key)
      next
    end

    attrs[key] =
      if key == 'style'
        sanitize_css(raw)
      else
        # unescape first so already-escaped entities are not escaped twice
        CGI::escapeHTML(CGI::unescapeHTML(raw))
      end
  end
end
105
+
106
# Decides whether +value+ of attribute +attr_name+ uses a protocol that is
# not allowed. Returns false for attributes that cannot hold URIs, nil when
# no (possibly encoded) protocol separator is found in the value, and
# otherwise true/false depending on whether the text before the first
# separator is missing from #allowed_protocols.
def contains_bad_protocols?(attr_name, value)
  return false unless uri_attributes.include?(attr_name)
  return unless value =~ /(^[^\/:]*):|(&#0*58)|(&#x70)|(&#x0*3a)|(%|&#37;)3A/i

  scheme = value.split(protocol_separator).first.downcase.strip
  !allowed_protocols.include?(scheme)
end
110
+ end
111
+ end
112
+ end
@@ -0,0 +1,104 @@
1
+ module Autolinker
2
+ module HTML
3
+ # A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each
4
+ # token is a string. Each string represents either "text", or an HTML element.
5
+ #
6
+ # This currently assumes valid XHTML, which means no free < or > characters.
7
+ #
8
+ # Usage:
9
+ #
10
+ # tokenizer = HTML::Tokenizer.new(text)
11
+ # while token = tokenizer.next
12
+ # p token
13
+ # end
14
+ class Tokenizer #:nodoc:
15
+
16
+ # The current (byte) position in the text
17
+ attr_reader :position
18
+
19
+ # The current line number
20
+ attr_reader :line
21
+
22
# Create a new Tokenizer for the given text. Scanning starts at position 0;
# the reported line starts at 0 and the running line counter at 1.
def initialize(text)
  @position, @line, @current_line = 0, 0, 1
  @scanner = StringScanner.new(text)
end
29
+
30
# Return the next token in the sequence, or +nil+ if there are no more tokens
# in the stream. Before scanning, the token's start position and line are
# recorded in @position and @line.
def next
  return nil if @scanner.eos?

  @position = @scanner.pos
  @line = @current_line

  # "<" followed by a non-space character opens a tag; anything else is text
  token = @scanner.check(/<\S/) ? scan_tag : scan_text
  update_current_line(token)
end
42
+
43
+ private
44
+
45
# Treat the text at the current position as a tag, and scan it. Handles
# comments, CDATA sections, doctype-style "<!" declarations and regular tags.
# An unterminated comment or CDATA section swallows the rest of the input.
def scan_tag
  buffer = @scanner.getch

  if (opener = @scanner.scan(/!--/)) # comment
    buffer << opener << (@scanner.scan_until(/--\s*>/) || @scanner.scan_until(/\Z/))
  elsif (opener = @scanner.scan(/!\[CDATA\[/))
    buffer << opener << (@scanner.scan_until(/\]\]>/) || @scanner.scan_until(/\Z/))
  elsif (opener = @scanner.scan(/!/)) # doctype
    buffer << opener << consume_quoted_regions
  else
    buffer << consume_quoted_regions
  end

  buffer
end
64
+
65
# Consume one character unconditionally, then everything up to (but not
# including) the next "<", and return the combined text.
def scan_text
  lead = @scanner.getch
  rest = @scanner.scan(/[^<]*/)
  "#{lead}#{rest}"
end
69
+
70
# Advances the running line counter by the number of newlines found in
# +text+ and returns +text+ itself.
def update_current_line(text)
  @current_line += text.scan(/\r?\n/).size
  text
end
75
+
76
# Scans the remainder of a tag and returns it. Quoted strings are consumed
# whole, so less-than and greater-than characters inside them do not end the
# tag. Scanning stops after the closing ">" or just before a stray "<".
def consume_quoted_regions
  consumed = ""

  loop do
    chunk = @scanner.scan_until(/['"<>]/)
    break unless chunk

    delim = @scanner.matched
    if delim == "<"
      # a new tag is starting: drop the "<" from this chunk and rewind so
      # it is scanned again as part of the next token
      chunk = chunk.chop
      @scanner.pos -= 1
    end

    consumed << chunk
    break if ["<", ">"].include?(delim)

    # consume the quoted region
    while (quoted = @scanner.scan_until(/[\\#{delim}]/))
      consumed << quoted
      break if @scanner.matched == delim || @scanner.eos?
      consumed << @scanner.getch # skip the escaped character
    end
  end

  consumed
end
102
+ end
103
+ end
104
+ end
@@ -0,0 +1,292 @@
1
+ module Autolinker
2
+ class TextHelper
3
+ # Turns all URLs and e-mail addresses into clickable links. The <tt>:link</tt> option
4
+ # will limit what should be linked. You can add HTML attributes to the links using
5
+ # <tt>:html</tt>. Possible values for <tt>:link</tt> are <tt>:all</tt> (default),
6
+ # <tt>:email_addresses</tt>, and <tt>:urls</tt>. If a block is given, each URL and
7
+ # e-mail address is yielded and the result is used as the link text. By default the
8
+ # text given is sanitized, you can override this behaviour setting the
9
+ # <tt>:sanitize</tt> option to false, or you can add options to the sanitization of
10
+ # the text using the <tt>:sanitize_options</tt> option hash.
11
+ #
12
+ # ==== Examples
13
+ # auto_link("Go to http://www.rubyonrails.org and say hello to david@loudthinking.com")
14
+ # # => "Go to <a href=\"http://www.rubyonrails.org\">http://www.rubyonrails.org</a> and
15
+ # # say hello to <a href=\"mailto:david@loudthinking.com\">david@loudthinking.com</a>"
16
+ #
17
+ # auto_link("Visit http://www.loudthinking.com/ or e-mail david@loudthinking.com", :link => :urls)
18
+ # # => "Visit <a href=\"http://www.loudthinking.com/\">http://www.loudthinking.com/</a>
19
+ # # or e-mail david@loudthinking.com"
20
+ #
21
+ # auto_link("Visit http://www.loudthinking.com/ or e-mail david@loudthinking.com", :link => :email_addresses)
22
+ # # => "Visit http://www.loudthinking.com/ or e-mail <a href=\"mailto:david@loudthinking.com\">david@loudthinking.com</a>"
23
+ #
24
+ # post_body = "Welcome to my new blog at http://www.myblog.com/. Please e-mail me at me@email.com."
25
+ # auto_link(post_body, :html => { :target => '_blank' }) do |text|
26
+ # truncate(text, :length => 15)
27
+ # end
28
+ # # => "Welcome to my new blog at <a href=\"http://www.myblog.com/\" target=\"_blank\">http://www.m...</a>.
29
+ # Please e-mail me at <a href=\"mailto:me@email.com\">me@email.com</a>."
30
+ #
31
+ #
32
+ # You can still use <tt>auto_link</tt> with the old API that accepts the
33
+ # +link+ as its optional second parameter and the +html_options+ hash
34
+ # as its optional third parameter:
35
+ # post_body = "Welcome to my new blog at http://www.myblog.com/. Please e-mail me at me@email.com."
36
+ # auto_link(post_body, :urls)
37
+ # # => "Welcome to my new blog at <a href=\"http://www.myblog.com/\">http://www.myblog.com</a>.
38
+ # Please e-mail me at me@email.com."
39
+ #
40
+ # auto_link(post_body, :all, :target => "_blank")
41
+ # # => "Welcome to my new blog at <a href=\"http://www.myblog.com/\" target=\"_blank\">http://www.myblog.com</a>.
42
+ # Please e-mail me at <a href=\"mailto:me@email.com\">me@email.com</a>."
43
def auto_link(text, *args, &block)
  return '' if text.nil? || text.empty?

  # Exactly two extra arguments means the old (link, html_options) call form,
  # whose trailing Hash holds HTML attributes rather than options.
  settings =
    if args.size == 2
      {}
    else
      extract_options!(args)
    end

  if args.size > 0
    link_kind, html_attrs = args
    settings[:link] = link_kind || :all
    settings[:html] = html_attrs || {}
  end
  settings = { :link => :all, :html => {} }.merge(settings)

  # Sanitizing is on unless :sanitize is explicitly false.
  should_sanitize = settings[:sanitize] != false
  text = conditional_sanitize(text, should_sanitize, settings[:sanitize_options] || {}).to_str

  # Any other :link value falls through the case and yields nil.
  case settings[:link].to_sym
  when :all
    with_urls = auto_link_urls(text, settings[:html], settings, &block)
    auto_link_email_addresses(with_urls, settings[:html], &block)
  when :email_addresses
    auto_link_email_addresses(text, settings[:html], &block)
  when :urls
    auto_link_urls(text, settings[:html], settings, &block)
  end
end
67
+
68
+ private
69
+
70
# Matches a URL candidate: either "<scheme>://" for one of the listed schemes
# (the scheme plus colon is captured in group 1) or a bare "www." prefix,
# followed by everything up to whitespace, "<" or a non-breaking space.
AUTO_LINK_RE = %r{
    (?: ((?:ed2k|ftp|http|https|irc|mailto|news|gopher|nntp|telnet|webcal|xmpp|callto|feed|svn|urn|aim|rsync|tag|ssh|sftp|rtsp|afs|file):)// | www\. )
    [^\s<\u00A0]+
  }x

# Context regexps, in the order #auto_linked? indexes them.
AUTO_LINK_CRE = [
  /<[^>]+$/,
  /^[^>]*>/,
  /<a\b.*?>/i,
  /<\/a>/i
]

# A single character accepted in the local part of an e-mail address.
AUTO_EMAIL_LOCAL_RE = /[\w.!#\$%&'*\/=?^`{|}~+-]/
# A whole e-mail address candidate.
AUTO_EMAIL_RE = /[\w.!#\$%+-]\.?(?:#{AUTO_EMAIL_LOCAL_RE}+\.)*#{AUTO_EMAIL_LOCAL_RE}*@[\w-]+(?:\.[\w-]+)+/

# Closing bracket => its opening counterpart.
BRACKETS = {
  ']' => '[',
  ')' => '(',
  '}' => '{'
}

# Character-class fragment for "word" characters, chosen by Ruby version.
WORD_PATTERN =
  if RUBY_VERSION < '1.9'
    '\w'
  else
    '\p{Word}'
  end
84
+
85
# Replaces every URL in +text+ with an anchor tag. When a block is given it
# receives each URL and its result becomes the link text. Matches that
# #auto_linked? reports as already linked are left untouched.
def auto_link_urls(text, link_attributes = {}, options = {})
  text.gsub(AUTO_LINK_RE) do
    found = Regexp.last_match
    scheme = found[1]
    href = found[0]
    next href if auto_linked?(found.pre_match, found.post_match)

    # Peel trailing punctuation off the URL one character at a time; it is
    # re-appended after the link.
    trailing = []
    while href.sub!(/[^#{WORD_PATTERN}\/-]$/, '')
      trailing.push(Regexp.last_match(0))
      closer = trailing.last
      opener = BRACKETS[closer]
      # A closing bracket that balances an opener inside the URL belongs to
      # the URL: put it back and stop peeling.
      if opener && href.scan(opener).size > href.scan(closer).size
        href << trailing.pop
        break
      end
    end

    link_text = block_given? ? yield(href) : href
    href = 'http://' + href unless scheme

    if options[:sanitize] != false
      link_text = sanitize(link_text)
      href = sanitize(href)
    end

    anchor = content_tag(:a, link_text, link_attributes.merge('href' => href), !!options[:sanitize])
    anchor + trailing.reverse.join('')
  end
end
117
+
118
# Replaces every e-mail address in +text+ with a mailto link built by
# #mail_to. When a block is given it receives each address and its result
# becomes the link text. Matches that #auto_linked? reports as already linked
# are left untouched.
def auto_link_email_addresses(text, html_options = {}, options = {})
  text.gsub(AUTO_EMAIL_RE) do
    found = Regexp.last_match
    address = found[0]
    next address if auto_linked?(found.pre_match, found.post_match)

    label = block_given? ? yield(address) : address

    if options[:sanitize] != false
      address = sanitize(address)
      # skip the second pass when the label is just the sanitized address
      label = sanitize(label) unless address == label
    end

    mail_to(address, label, html_options)
  end
end
137
+
138
# Removes and returns the trailing Hash of +args+, or returns a new empty
# Hash (leaving +args+ alone) when the last element is not a Hash.
def extract_options!(args)
  args.last.is_a?(Hash) ? args.pop : {}
end
145
+
146
# Detects whether a match sits in already linked context. Truthy when the
# match is in the middle of a tag (+left+ ends inside an unclosed "<..." and
# +right+ runs into a ">"), or when the last "<a ...>" in +left+ has no
# "</a>" after it.
def auto_linked?(left, right)
  tag_open, tag_close, anchor_open, anchor_close = AUTO_LINK_CRE

  (left =~ tag_open && right =~ tag_close) ||
    (left.rindex(anchor_open) && Regexp.last_match.post_match !~ anchor_close)
end
151
+
152
# Sanitizes +target+ only when +condition+ is truthy; otherwise returns it
# unchanged.
def conditional_sanitize(target, condition, sanitize_options = {})
  return target unless condition

  sanitize(target, sanitize_options)
end
155
+
156
# Runs +html+ through a freshly built Autolinker::HTML::Sanitizer.
def sanitize(html, options = {})
  sanitizer = Autolinker::HTML::Sanitizer.new
  sanitizer.sanitize(html, options)
end
159
+
160
# Builds an HTML element named +name+. Without a block the second argument is
# the content. With a block the content comes from capturing the block, and a
# Hash passed as the second argument is taken as the options.
def content_tag(name, content_or_options_with_block = nil, options = nil, escape = true, &block)
  unless block_given?
    return content_tag_string(name, content_or_options_with_block, options, escape)
  end

  options = content_or_options_with_block if content_or_options_with_block.is_a?(Hash)
  content_tag_string(name, capture(&block), options, escape)
end
168
+
169
# Renders "<name attrs>content</name>". Attributes come from #tag_options
# when +options+ is given, PRE_CONTENT_STRINGS may supply a prefix for the
# tag, and the content is HTML-escaped only when +escape+ is truthy.
def content_tag_string(name, content, options, escape = true)
  attributes = options ? tag_options(options, escape) : nil
  prefix = PRE_CONTENT_STRINGS[name.to_sym]
  body = escape ? ERB::Util.h(content) : content

  "<#{name}#{attributes}>#{prefix}#{body}</#{name}>"
end
173
+
174
# Attributes rendered in the name="name" form by #tag_options. The HTML
# attribute is spelled "seamless"; the historical misspelling "seemless" is
# kept so existing callers relying on it keep working.
BOOLEAN_ATTRIBUTES = %w(disabled readonly multiple checked autobuffer
                     autoplay controls loop selected hidden scoped async
                     defer reversed ismap seemless seamless muted required
                     autofocus novalidate formnovalidate open pubdate).to_set
# Accept the symbol form of every name as well.
BOOLEAN_ATTRIBUTES.merge(BOOLEAN_ATTRIBUTES.map(&:to_sym))

# Text inserted right after the opening tag, keyed by tag name.
PRE_CONTENT_STRINGS = {
  :textarea => "\n"
}
183
+
184
# Serializes +options+ into an HTML attribute string with a leading space and
# the attributes sorted. Returns nil when there is nothing to render.
#
# * a Hash under the 'data' key expands into data-* attributes
# * names in BOOLEAN_ATTRIBUTES render as name="name" when the value is truthy
# * nil values are skipped and Array values are joined with spaces
#
# Values are HTML-escaped only when +escape+ is truthy.
def tag_options(options, escape = true)
  return if options.nil? || options.empty?

  rendered = []
  options.each_pair do |key, value|
    if key.to_s == 'data' && value.is_a?(Hash)
      value.each do |data_key, data_value|
        plain = data_value.is_a?(String) || data_value.is_a?(Symbol) || data_value.is_a?(BigDecimal)
        data_value = data_value.to_json unless plain
        data_value = ERB::Util.html_escape(data_value) if escape
        rendered << %(data-#{data_key.to_s.dasherize}="#{data_value}")
      end
    elsif BOOLEAN_ATTRIBUTES.include?(key)
      rendered << %(#{key}="#{key}") if value
    elsif !value.nil?
      text = value.is_a?(Array) ? value.join(" ") : value
      text = ERB::Util.html_escape(text) if escape
      rendered << %(#{key}="#{text}")
    end
  end

  return if rendered.empty?

  " #{rendered.sort.join(' ')}"
end
207
+
208
+ # Creates a mailto link tag to the specified +email_address+, which is
209
+ # also used as the name of the link unless +name+ is specified. Additional
210
+ # HTML attributes for the link can be passed in +html_options+.
211
+ #
212
+ # +mail_to+ has several methods for hindering email harvesters and customizing
213
+ # the email itself by passing special keys to +html_options+.
214
+ #
215
+ # ==== Options
216
+ # * <tt>:encode</tt> - This key will accept the strings "javascript" or "hex".
217
+ # Passing "javascript" will dynamically create and encode the mailto link then
218
+ # eval it into the DOM of the page. This method will not show the link on
219
+ # the page if the user has JavaScript disabled. Passing "hex" will hex
220
+ # encode the +email_address+ before outputting the mailto link.
221
+ # * <tt>:replace_at</tt> - When the link +name+ isn't provided, the
222
+ # +email_address+ is used for the link label. You can use this option to
223
+ # obfuscate the +email_address+ by substituting the @ sign with the string
224
+ # given as the value.
225
+ # * <tt>:replace_dot</tt> - When the link +name+ isn't provided, the
226
+ # +email_address+ is used for the link label. You can use this option to
227
+ # obfuscate the +email_address+ by substituting the . in the email with the
228
+ # string given as the value.
229
+ # * <tt>:subject</tt> - Preset the subject line of the email.
230
+ # * <tt>:body</tt> - Preset the body of the email.
231
+ # * <tt>:cc</tt> - Carbon Copy additional recipients on the email.
232
+ # * <tt>:bcc</tt> - Blind Carbon Copy additional recipients on the email.
233
+ #
234
+ # ==== Examples
235
+ # mail_to "me@domain.com"
236
+ # # => <a href="mailto:me@domain.com">me@domain.com</a>
237
+ #
238
+ # mail_to "me@domain.com", "My email", :encode => "javascript"
239
+ # # => <script type="text/javascript">eval(decodeURIComponent('%64%6f%63...%27%29%3b'))</script>
240
+ #
241
+ # mail_to "me@domain.com", "My email", :encode => "hex"
242
+ # # => <a href="mailto:%6d%65@%64%6f%6d%61%69%6e.%63%6f%6d">My email</a>
243
+ #
244
+ # mail_to "me@domain.com", nil, :replace_at => "_at_", :replace_dot => "_dot_", :class => "email"
245
+ # # => <a href="mailto:me@domain.com" class="email">me_at_domain_dot_com</a>
246
+ #
247
+ # mail_to "me@domain.com", "My email", :cc => "ccaddress@domain.com",
248
+ # :subject => "This is an example email"
249
+ # # => <a href="mailto:me@domain.com?cc=ccaddress@domain.com&subject=This%20is%20an%20example%20email">My email</a>
250
def mail_to(email_address, name = nil, html_options = {})
  # Work on a string-keyed copy. The special options are looked up by string
  # key, so this makes the documented symbol form (:encode, :replace_at, ...)
  # work, and it keeps the deletes below from stripping keys out of a hash
  # the caller may reuse for the next address.
  html_options = html_options.each_with_object({}) { |(key, value), copy| copy[key.to_s] = value }

  email_address = ERB::Util.html_escape(email_address)

  encode = html_options.delete("encode").to_s

  # Build the ?cc=...&subject=... query from whichever extras were supplied.
  extras = %w{ cc bcc body subject }.map { |item|
    option = html_options.delete(item) || next
    "#{item}=#{Rack::Utils.escape(option).gsub("+", "%20")}"
  }.compact
  extras = extras.empty? ? '' : '?' + ERB::Util.html_escape(extras.join('&'))

  # Obfuscate a copy: to_str can return the very same String object, and the
  # in-place gsub! calls would then also rewrite the address used in the href.
  email_address_obfuscated = email_address.to_str.dup
  email_address_obfuscated.gsub!(/@/, html_options.delete("replace_at")) if html_options.key?("replace_at")
  email_address_obfuscated.gsub!(/\./, html_options.delete("replace_dot")) if html_options.key?("replace_dot")

  case encode
  when "javascript"
    # Percent-encode a document.write of the link and eval it client side.
    string = ''
    html = content_tag("a", name || email_address_obfuscated, html_options.merge("href" => "mailto:#{email_address}#{extras}"))
    html = escape_javascript(html.to_str)
    "document.write('#{html}');".each_byte do |c|
      string << sprintf("%%%x", c)
    end
    "<script type=\"#{Mime::JS}\">eval(decodeURIComponent('#{string}'))</script>"
  when "hex"
    # Link label: every byte as a decimal character reference.
    email_address_encoded = email_address_obfuscated.unpack('C*').map { |c|
      sprintf("&#%d;", c)
    }.join

    # href: "mailto:" as character references, then the address with its
    # word characters percent-encoded.
    string = 'mailto:'.unpack('C*').map { |c|
      sprintf("&#%d;", c)
    }.join + email_address.unpack('C*').map { |c|
      char = c.chr
      char =~ /\w/ ? sprintf("%%%x", c) : char
    }.join

    content_tag "a", name || email_address_encoded, html_options.merge("href" => "#{string}#{extras}")
  else
    content_tag "a", name || email_address_obfuscated, html_options.merge("href" => "mailto:#{email_address}#{extras}")
  end
end
290
+
291
+ end
292
+ end