twitter-text 1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +13 -0
- data/README.rdoc +58 -0
- data/Rakefile +92 -0
- data/TODO +3 -0
- data/lib/autolink.rb +101 -0
- data/lib/extractor.rb +69 -0
- data/lib/regex.rb +74 -0
- data/lib/twitter-text.rb +13 -0
- data/lib/unicode.rb +27 -0
- data/lib/validation.rb +51 -0
- data/spec/autolinking_spec.rb +427 -0
- data/spec/extractor_spec.rb +195 -0
- data/spec/regex_spec.rb +44 -0
- data/spec/spec_helper.rb +86 -0
- data/spec/unicode_spec.rb +30 -0
- data/spec/validation_spec.rb +42 -0
- metadata +79 -0
data/LICENSE
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
Copyright 2010 Twitter, Inc.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
4
|
+
use this file except in compliance with the License. You may obtain a copy of
|
5
|
+
the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
11
|
+
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
12
|
+
License for the specific language governing permissions and limitations under
|
13
|
+
the License.
|
data/README.rdoc
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
== twitter-text
|
2
|
+
|
3
|
+
A gem that provides text processing routines for Twitter Tweets. The major
|
4
|
+
reason for this is to unify the various auto-linking and extraction of
|
5
|
+
usernames, lists, hashtags and URLs.
|
6
|
+
|
7
|
+
== Extraction Examples
|
8
|
+
|
9
|
+
# Extraction
|
10
|
+
class MyClass
|
11
|
+
include Twitter::Extractor
|
12
|
+
usernames = extract_mentioned_screen_names("Mentioning @twitter and @jack")
|
13
|
+
# usernames = ["twitter", "jack"]
|
14
|
+
end
|
15
|
+
|
16
|
+
# Extraction with a block argument
|
17
|
+
class MyClass
|
18
|
+
include Twitter::Extractor
|
19
|
+
extract_reply_screen_name("@twitter are you hiring?") do |username|
|
20
|
+
# username = "twitter"
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
== Auto-linking Examples
|
25
|
+
|
26
|
+
# Auto-link
|
27
|
+
class MyClass
|
28
|
+
include Twitter::Autolink
|
29
|
+
|
30
|
+
html = auto_link("link @user, please #request")
|
31
|
+
end
|
32
|
+
|
33
|
+
=== Usernames
|
34
|
+
|
35
|
+
Username extraction and linking matches all valid Twitter usernames but does
|
36
|
+
not verify that the username is a valid Twitter account.
|
37
|
+
|
38
|
+
=== Lists
|
39
|
+
|
40
|
+
Auto-link and extract list names when they are written in @user/list-name
|
41
|
+
format.
|
42
|
+
|
43
|
+
=== Hashtags
|
44
|
+
|
45
|
+
Auto-link and extract hashtags, where a hashtag contains any latin letter or
|
46
|
+
number but cannot be solely numbers.
|
47
|
+
|
48
|
+
=== URLs
|
49
|
+
|
50
|
+
Auto-linking and extraction of URLs differs from the Rails default so that it
|
51
|
+
will work correctly in Tweets written in languages that do not include spaces
|
52
|
+
between words.
|
53
|
+
|
54
|
+
=== International
|
55
|
+
|
56
|
+
Special care has been taken to be sure that auto-linking and extraction work
|
57
|
+
in Tweets of all languages. This means that languages without spaces between
|
58
|
+
words should work equally well.
|
data/Rakefile
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
require 'rubygems' unless ENV['NO_RUBYGEMS']
|
2
|
+
require 'rake/gempackagetask'
|
3
|
+
require 'rake/rdoctask'
|
4
|
+
require 'rubygems/specification'
|
5
|
+
require 'date'
|
6
|
+
require 'spec/rake/spectask'
|
7
|
+
require 'spec/rake/verify_rcov'
|
8
|
+
|
9
|
+
|
10
|
+
spec = Gem::Specification.new do |s|
|
11
|
+
s.name = "twitter-text"
|
12
|
+
s.version = "1.0"
|
13
|
+
s.author = "Matt Sanford"
|
14
|
+
s.email = "matt@twitter.com"
|
15
|
+
s.homepage = "http://twitter.com"
|
16
|
+
s.description = s.summary = "A gem that provides text handling for Twitter"
|
17
|
+
|
18
|
+
s.platform = Gem::Platform::RUBY
|
19
|
+
s.has_rdoc = true
|
20
|
+
s.summary = "Twitter text handling library"
|
21
|
+
|
22
|
+
s.add_dependency "action_view"
|
23
|
+
|
24
|
+
s.require_path = 'lib'
|
25
|
+
s.autorequire = ''
|
26
|
+
s.files = %w(LICENSE README.rdoc Rakefile TODO) + Dir.glob("{lib,spec}/**/*")
|
27
|
+
end
|
28
|
+
|
29
|
+
task :default => :spec
|
30
|
+
|
31
|
+
desc "Run specs"
|
32
|
+
Spec::Rake::SpecTask.new do |t|
|
33
|
+
t.spec_files = FileList['spec/**/*_spec.rb']
|
34
|
+
t.spec_opts = %w(-fs --color)
|
35
|
+
end
|
36
|
+
|
37
|
+
desc "Run all examples with RCov"
|
38
|
+
Spec::Rake::SpecTask.new('spec:rcov') do |t|
|
39
|
+
t.spec_files = FileList['spec/**/*.rb']
|
40
|
+
t.rcov = true
|
41
|
+
t.rcov_opts = ['--exclude', 'spec']
|
42
|
+
end
|
43
|
+
|
44
|
+
namespace :test do
|
45
|
+
namespace :conformance do
|
46
|
+
desc "Update conformance testing data"
|
47
|
+
task :update do
|
48
|
+
dir = File.join(File.dirname(__FILE__), "test", "twitter-text-conformance")
|
49
|
+
puts "Updating conformance data ... "
|
50
|
+
system("cd #{dir} && git pull origin master") || exit(1)
|
51
|
+
puts "Updating conformance data ... DONE"
|
52
|
+
end
|
53
|
+
|
54
|
+
desc "Run conformance test suite"
|
55
|
+
task :run do
|
56
|
+
ruby "test/conformance_test.rb"
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
desc "Run conformance test suite"
|
61
|
+
task :conformance => ['conformance:update', 'conformance:run'] do
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
|
66
|
+
namespace :doc do
|
67
|
+
Rake::RDocTask.new do |rd|
|
68
|
+
rd.main = "README.rdoc"
|
69
|
+
rd.rdoc_dir = 'doc'
|
70
|
+
rd.rdoc_files.include("README.rdoc", "lib/**/*.rb")
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
Rake::GemPackageTask.new(spec) do |pkg|
|
75
|
+
pkg.gem_spec = spec
|
76
|
+
end
|
77
|
+
|
78
|
+
desc "install the gem locally"
|
79
|
+
task :install => [:package] do
|
80
|
+
sh %{sudo gem install pkg/#{GEM}-#{GEM_VERSION}}
|
81
|
+
end
|
82
|
+
|
83
|
+
desc "create a gemspec file"
|
84
|
+
task :make_spec do
|
85
|
+
File.open("#{GEM}.gemspec", "w") do |file|
|
86
|
+
file.puts spec.to_ruby
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
desc "runs cruise control build"
|
91
|
+
task :cruise => [:spec, 'test:conformance'] do
|
92
|
+
end
|
data/lib/autolink.rb
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
|
2
|
+
module Twitter
|
3
|
+
# A module for including Tweet auto-linking in a class. The primary use of this is for helpers/views so they can auto-link
|
4
|
+
# usernames, lists, hashtags and URLs.
|
5
|
+
module Autolink
|
6
|
+
include ActionView::Helpers::TagHelper #tag_options needed by auto_link
|
7
|
+
|
8
|
+
# Default CSS class for auto-linked URLs
|
9
|
+
DEFAULT_URL_CLASS = "tweet-url"
|
10
|
+
# Default CSS class for auto-linked lists (along with the url class)
|
11
|
+
DEFAULT_LIST_CLASS = "list-slug"
|
12
|
+
# Default CSS class for auto-linked usernames (along with the url class)
|
13
|
+
DEFAULT_USERNAME_CLASS = "username"
|
14
|
+
# Default CSS class for auto-linked hashtags (along with the url class)
|
15
|
+
DEFAULT_HASHTAG_CLASS = "hashtag"
|
16
|
+
|
17
|
+
# Add <tt><a></a></tt> tags around the usernames, lists, hashtags and URLs in the provided <tt>text</tt>. The
|
18
|
+
# <tt><a></tt> tags can be controlled with the following entries in the <tt>options</tt>
|
19
|
+
# hash:
|
20
|
+
#
|
21
|
+
# <tt>:url_class</tt>:: class to add to all <tt><a></tt> tags
|
22
|
+
# <tt>:list_class</tt>:: class to add to list <tt><a></tt> tags
|
23
|
+
# <tt>:username_class</tt>:: class to add to username <tt><a></tt> tags
|
24
|
+
# <tt>:hashtag_class</tt>:: class to add to hashtag <tt><a></tt> tags
|
25
|
+
# <tt>:username_url_base</tt>:: the value for <tt>href</tt> attribute on username links. The <tt>@username</tt> (minus the <tt>@</tt>) will be appended at the end of this.
|
26
|
+
# <tt>:list_url_base</tt>:: the value for <tt>href</tt> attribute on list links. The <tt>@username/list</tt> (minus the <tt>@</tt>) will be appended at the end of this.
|
27
|
+
# <tt>:hashtag_url_base</tt>:: the value for <tt>href</tt> attribute on hashtag links. The <tt>#hashtag</tt> (minus the <tt>#</tt>) will be appended at the end of this.
|
28
|
+
def auto_link(text, options = {}, &block) # :yields: hashtag_or_list_or_username
|
29
|
+
options = options.dup
|
30
|
+
auto_link_usernames_or_lists(auto_link_urls_custom(auto_link_hashtags(text), options, &block), &block)
|
31
|
+
end
|
32
|
+
|
33
|
+
# Add <tt><a></a></tt> tags around the usernames and lists in the provided <tt>text</tt>. The
|
34
|
+
# <tt><a></tt> tags can be controlled with the following entries in the <tt>options</tt>
|
35
|
+
# hash:
|
36
|
+
#
|
37
|
+
# <tt>:url_class</tt>:: class to add to all <tt><a></tt> tags
|
38
|
+
# <tt>:list_class</tt>:: class to add to list <tt><a></tt> tags
|
39
|
+
# <tt>:username_class</tt>:: class to add to username <tt><a></tt> tags
|
40
|
+
# <tt>:username_url_base</tt>:: the value for <tt>href</tt> attribute on username links. The <tt>@username</tt> (minus the <tt>@</tt>) will be appended at the end of this.
|
41
|
+
# <tt>:list_url_base</tt>:: the value for <tt>href</tt> attribute on list links. The <tt>@username/list</tt> (minus the <tt>@</tt>) will be appended at the end of this.
|
42
|
+
def auto_link_usernames_or_lists(text, options = {}) # :yields: list_or_username
|
43
|
+
options = options.dup
|
44
|
+
options[:url_class] ||= DEFAULT_URL_CLASS
|
45
|
+
options[:list_class] ||= DEFAULT_LIST_CLASS
|
46
|
+
options[:username_class] ||= DEFAULT_USERNAME_CLASS
|
47
|
+
options[:username_url_base] ||= "http://twitter.com/"
|
48
|
+
options[:list_url_base] ||= "http://twitter.com/"
|
49
|
+
|
50
|
+
text.gsub(Twitter::Regex[:auto_link_usernames_or_lists]) do
|
51
|
+
if $4 && !options[:suppress_lists]
|
52
|
+
# the link is a list
|
53
|
+
text = list = "#{$3}#{$4}"
|
54
|
+
text = yield(list) if block_given?
|
55
|
+
"#{$1}#{$2}<a class=\"#{options[:url_class]} #{options[:list_class]}\" href=\"#{options[:list_url_base]}#{list.downcase}\">#{text}</a>"
|
56
|
+
else
|
57
|
+
# this is a screen name
|
58
|
+
text = $3
|
59
|
+
text = yield(text) if block_given?
|
60
|
+
"#{$1}#{$2}<a class=\"#{options[:url_class]} #{options[:username_class]}\" href=\"#{options[:username_url_base]}#{$3}\">#{text}</a>"
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
# Add <tt><a></a></tt> tags around the hashtags in the provided <tt>text</tt>. The
|
66
|
+
# <tt><a></tt> tags can be controlled with the following entries in the <tt>options</tt>
|
67
|
+
# hash:
|
68
|
+
#
|
69
|
+
# <tt>:url_class</tt>:: class to add to all <tt><a></tt> tags
|
70
|
+
# <tt>:hashtag_class</tt>:: class to add to hashtag <tt><a></tt> tags
|
71
|
+
# <tt>:hashtag_url_base</tt>:: the value for <tt>href</tt> attribute. The hashtag text (minus the <tt>#</tt>) will be appended at the end of this.
|
72
|
+
#
|
73
|
+
def auto_link_hashtags(text, options = {}) # :yields: hashtag_text
|
74
|
+
options = options.dup
|
75
|
+
options[:url_class] ||= DEFAULT_URL_CLASS
|
76
|
+
options[:hashtag_class] ||= DEFAULT_HASHTAG_CLASS
|
77
|
+
options[:hashtag_url_base] ||= "http://twitter.com/search?q=%23"
|
78
|
+
|
79
|
+
text.gsub(Twitter::Regex[:auto_link_hashtags]) do
|
80
|
+
before = $1
|
81
|
+
hash = $2
|
82
|
+
text = $3
|
83
|
+
text = yield(text) if block_given?
|
84
|
+
"#{before}<a href=\"#{options[:hashtag_url_base]}#{text}\" title=\"##{text}\" class=\"#{options[:url_class]} #{options[:hashtag_class]}\">#{hash}#{text}</a>"
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
# Add <tt><a></a></tt> tags around the URLs in the provided <tt>text</tt>. Any
|
89
|
+
# elements in the <tt>href_options</tt> hash will be converted to HTML attributes
|
90
|
+
# and placed in the <tt><a></tt> tag.
|
91
|
+
def auto_link_urls_custom(text, href_options = {})
|
92
|
+
text.gsub(Twitter::Regex[:valid_url]) do
|
93
|
+
all, before, url, protocol = $1, $2, $3, $4
|
94
|
+
options = tag_options(href_options.stringify_keys) || ""
|
95
|
+
full_url = (protocol == "www." ? "http://#{url}" : url)
|
96
|
+
"#{before}<a href=\"#{full_url}\"#{options}>#{url}</a>"
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
end
|
101
|
+
end
|
data/lib/extractor.rb
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
|
2
|
+
module Twitter
|
3
|
+
# A module for including Tweet parsing in a class. This module provides functions for the extraction and processing
|
4
|
+
# of usernames, lists, URLs and hashtags.
|
5
|
+
module Extractor
|
6
|
+
|
7
|
+
# Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
|
8
|
+
# <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
|
9
|
+
# will be returned.
|
10
|
+
#
|
11
|
+
# If a block is given then it will be called for each username.
|
12
|
+
def extract_mentioned_screen_names(text) # :yields: username
|
13
|
+
return [] unless text
|
14
|
+
|
15
|
+
possible_screen_names = []
|
16
|
+
text.scan(Twitter::Regex[:extract_mentions]) {|before,sn| possible_screen_names << sn }
|
17
|
+
possible_screen_names.each{|sn| yield sn } if block_given?
|
18
|
+
possible_screen_names
|
19
|
+
end
|
20
|
+
|
21
|
+
# Extracts the username replied to in the Tweet <tt>text</tt>. If the
|
22
|
+
# <tt>text</tt> is <tt>nil</tt> or is not a reply, <tt>nil</tt> will be returned.
|
23
|
+
#
|
24
|
+
# If a block is given then it will be called with the username replied to (if any)
|
25
|
+
def extract_reply_screen_name(text) # :yields: username
|
26
|
+
return nil unless text
|
27
|
+
|
28
|
+
possible_screen_name = text.match(Twitter::Regex[:extract_reply])
|
29
|
+
return unless possible_screen_name.respond_to?(:captures)
|
30
|
+
screen_name = possible_screen_name.captures.first
|
31
|
+
yield screen_name if block_given?
|
32
|
+
screen_name
|
33
|
+
end
|
34
|
+
|
35
|
+
# Extracts a list of all URLs included in the Tweet <tt>text</tt>. If the
|
36
|
+
# <tt>text</tt> is <tt>nil</tt> or contains no URLs an empty array
|
37
|
+
# will be returned.
|
38
|
+
#
|
39
|
+
# If a block is given then it will be called for each URL.
|
40
|
+
def extract_urls(text) # :yields: url
|
41
|
+
return [] unless text
|
42
|
+
|
43
|
+
urls = []
|
44
|
+
text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, path, query|
|
45
|
+
urls << (protocol == "www." ? "http://#{url}" : url)
|
46
|
+
end
|
47
|
+
urls.each{|url| yield url } if block_given?
|
48
|
+
urls
|
49
|
+
end
|
50
|
+
|
51
|
+
# Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
|
52
|
+
# <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
|
53
|
+
# will be returned. The array returned will not include the leading <tt>#</tt>
|
54
|
+
# character.
|
55
|
+
#
|
56
|
+
# If a block is given then it will be called for each hashtag.
|
57
|
+
def extract_hashtags(text) # :yields: hashtag_text
|
58
|
+
return [] unless text
|
59
|
+
|
60
|
+
tags = []
|
61
|
+
text.scan(Twitter::Regex[:auto_link_hashtags]) do |before, hash, hash_text|
|
62
|
+
tags << hash_text
|
63
|
+
end
|
64
|
+
tags.each{|tag| yield tag } if block_given?
|
65
|
+
tags
|
66
|
+
end
|
67
|
+
|
68
|
+
end
|
69
|
+
end
|
data/lib/regex.rb
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
|
2
|
+
module Twitter
|
3
|
+
# A collection of regular expressions for parsing Tweet text. The regular expression
|
4
|
+
# list is frozen at load time to ensure immutability. These regular expressions are
|
5
|
+
# used throughout the <tt>Twitter</tt> classes. Special care has been taken to make
|
6
|
+
# sure these regular expressions work with Tweets in all languages.
|
7
|
+
class Regex
|
8
|
+
REGEXEN = {} # :nodoc:
|
9
|
+
|
10
|
+
# Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
|
11
|
+
# to access both the list of characters and a pattern suitable for use with String#split
|
12
|
+
# Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
|
13
|
+
UNICODE_SPACES = [
|
14
|
+
(0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
|
15
|
+
0x0020, # White_Space # Zs SPACE
|
16
|
+
0x0085, # White_Space # Cc <control-0085>
|
17
|
+
0x00A0, # White_Space # Zs NO-BREAK SPACE
|
18
|
+
0x1680, # White_Space # Zs OGHAM SPACE MARK
|
19
|
+
0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
|
20
|
+
(0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
|
21
|
+
0x2028, # White_Space # Zl LINE SEPARATOR
|
22
|
+
0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
|
23
|
+
0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
|
24
|
+
0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
|
25
|
+
0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
|
26
|
+
].flatten.freeze
|
27
|
+
REGEXEN[:spaces] = Regexp.new(UNICODE_SPACES.collect{ |e| [e].pack 'U*' }.join('|'))
|
28
|
+
|
29
|
+
REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])[@@]([a-zA-Z0-9_]{1,20})/
|
30
|
+
REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*[@@]([a-zA-Z0-9_]{1,20})/o
|
31
|
+
|
32
|
+
REGEXEN[:list_name] = /^[a-zA-Z\x80-\xff].{0,79}$/
|
33
|
+
|
34
|
+
# Latin accented characters. 0xD7 is excluded from the range because it is the multiplication sign, easily confused with "x"; 0xF7 (the division sign) is excluded as well.
|
35
|
+
LATIN_ACCENTS = [(0xc0..0xd6).to_a, (0xd8..0xf6).to_a, (0xf8..0xff).to_a].flatten.pack('U*').freeze
|
36
|
+
REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
|
37
|
+
|
38
|
+
# Characters considered valid in a hashtag but not at the beginning, where only a-z, 0-9 and _ are valid.
|
39
|
+
HASHTAG_CHARACTERS = /[a-z0-9_#{LATIN_ACCENTS}]/io
|
40
|
+
REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/]+)(#|#)([0-9A-Z_]*[A-Z_]+#{HASHTAG_CHARACTERS}*)/io
|
41
|
+
REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9\x80-\xff\-]{0,79})?/
|
42
|
+
REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\<\|:~\(|\}:o\{|:\-\[|\>o\<|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
|
43
|
+
|
44
|
+
# URL related hash regex collection
|
45
|
+
REGEXEN[:valid_preceeding_chars] = /(?:[^\/"':!=]|^)/
|
46
|
+
REGEXEN[:valid_domain] = /[a-z0-9\.-]+\.[a-z]{2,}(?::[0-9]+)?/i
|
47
|
+
REGEXEN[:valid_url_path_chars] = /[a-z0-9!\*'\(\);:@&=\+\$\/%#\[\]\-_\.,~]/i
|
48
|
+
# Valid end-of-path characters (so /foo. does not gobble the period).
|
49
|
+
# 1. Allow ) for Wikipedia URLs.
|
50
|
+
# 2. Allow =&# for empty URL parameters and other URL-join artifacts
|
51
|
+
REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9\)=#\/]/i
|
52
|
+
REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:@&=\+\$\/%#\[\]\-_\.,~]/i
|
53
|
+
REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#]/i
|
54
|
+
REGEXEN[:valid_url] = %r{
|
55
|
+
( # $1 total match
|
56
|
+
(#{REGEXEN[:valid_preceeding_chars]}) # $2 Preceeding chracter
|
57
|
+
( # $3 URL
|
58
|
+
(https?:\/\/|www\.) # $4 Protocol or beginning
|
59
|
+
(#{REGEXEN[:valid_domain]}) # $5 Domain(s) and optional post number
|
60
|
+
(/#{REGEXEN[:valid_url_path_chars]}*#{REGEXEN[:valid_url_path_ending_chars]}?)? # $6 URL Path
|
61
|
+
(\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $7 Query String
|
62
|
+
)
|
63
|
+
)
|
64
|
+
}iox;
|
65
|
+
|
66
|
+
REGEXEN.each_pair{|k,v| v.freeze }
|
67
|
+
|
68
|
+
# Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
|
69
|
+
# is not a known symbol a <tt>nil</tt> will be returned.
|
70
|
+
def self.[](key)
|
71
|
+
REGEXEN[key]
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
data/lib/twitter-text.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
|
2
|
+
raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless ['u','UTF8'].include?($KCODE)
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
|
6
|
+
# Needed for auto-linking
|
7
|
+
require 'action_view'
|
8
|
+
|
9
|
+
require File.join(File.dirname(__FILE__), 'regex')
|
10
|
+
require File.join(File.dirname(__FILE__), 'autolink')
|
11
|
+
require File.join(File.dirname(__FILE__), 'extractor')
|
12
|
+
require File.join(File.dirname(__FILE__), 'unicode')
|
13
|
+
require File.join(File.dirname(__FILE__), 'validation')
|
data/lib/unicode.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
|
2
|
+
module Twitter
|
3
|
+
# This module lazily defines constants of the form Uxxxx for all Unicode
|
4
|
+
# codepoints from U0000 to U10FFFF. The value of each constant is the
|
5
|
+
# UTF-8 string for the codepoint.
|
6
|
+
# Examples:
|
7
|
+
# copyright = Unicode::U00A9
|
8
|
+
# euro = Unicode::U20AC
|
9
|
+
# infinity = Unicode::U221E
|
10
|
+
#
|
11
|
+
module Unicode
|
12
|
+
CODEPOINT_REGEX = /^U_?([0-9a-fA-F]{4,5}|10[0-9a-fA-F]{4})$/
|
13
|
+
|
14
|
+
def self.const_missing(name)
|
15
|
+
# Check that the constant name is of the right form: U0000 to U10FFFF
|
16
|
+
if name.to_s =~ CODEPOINT_REGEX
|
17
|
+
# Convert the codepoint to an immutable UTF-8 string,
|
18
|
+
# define a real constant for that value and return the value
|
19
|
+
#p name, name.class
|
20
|
+
const_set(name, [$1.to_i(16)].pack("U").freeze)
|
21
|
+
else # Raise an error for constants that are not Unicode.
|
22
|
+
raise NameError, "Uninitialized constant: Unicode::#{name}"
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|