redaranj-twitter-text 1.0.4.191
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +13 -0
- data/README.rdoc +66 -0
- data/Rakefile +115 -0
- data/TODO +3 -0
- data/lib/autolink.rb +118 -0
- data/lib/extractor.rb +70 -0
- data/lib/regex.rb +75 -0
- data/lib/twitter-text.rb +13 -0
- data/lib/unicode.rb +27 -0
- data/lib/validation.rb +52 -0
- data/spec/autolinking_spec.rb +437 -0
- data/spec/extractor_spec.rb +175 -0
- data/spec/regex_spec.rb +24 -0
- data/spec/spec_helper.rb +96 -0
- data/spec/test_urls.rb +31 -0
- data/spec/unicode_spec.rb +31 -0
- data/spec/validation_spec.rb +43 -0
- metadata +93 -0
data/LICENSE
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
Copyright 2010 Twitter, Inc.
|
2
|
+
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
4
|
+
use this file except in compliance with the License. You may obtain a copy of
|
5
|
+
the License at
|
6
|
+
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
11
|
+
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
12
|
+
License for the specific language governing permissions and limitations under
|
13
|
+
the License.
|
data/README.rdoc
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
== twitter-text
|
2
|
+
|
3
|
+
A gem that provides text processing routines for Twitter Tweets. The major
|
4
|
+
reason for this is to unify the various auto-linking and extraction of
|
5
|
+
usernames, lists, hashtags and URLs.
|
6
|
+
|
7
|
+
== Extraction Examples
|
8
|
+
|
9
|
+
# Extraction
|
10
|
+
class MyClass
|
11
|
+
include Twitter::Extractor
|
12
|
+
usernames = extract_mentioned_screen_names("Mentioning @twitter and @jack")
|
13
|
+
# usernames = ["twitter", "jack"]
|
14
|
+
end
|
15
|
+
|
16
|
+
# Extraction with a block argument
|
17
|
+
class MyClass
|
18
|
+
include Twitter::Extractor
|
19
|
+
extract_reply_screen_name("@twitter are you hiring?") do |username|
|
20
|
+
# username = "twitter"
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
== Auto-linking Examples
|
25
|
+
|
26
|
+
# Auto-link
|
27
|
+
class MyClass
|
28
|
+
include Twitter::Autolink
|
29
|
+
|
30
|
+
html = auto_link("link @user, please #request")
|
31
|
+
end
|
32
|
+
|
33
|
+
=== Usernames
|
34
|
+
|
35
|
+
Username extraction and linking matches all valid Twitter usernames but does
|
36
|
+
not verify that the username is a valid Twitter account.
|
37
|
+
|
38
|
+
=== Lists
|
39
|
+
|
40
|
+
Auto-link and extract list names when they are written in @user/list-name
|
41
|
+
format.
|
42
|
+
|
43
|
+
=== Hashtags
|
44
|
+
|
45
|
+
Auto-link and extract hashtags, where a hashtag contains any latin letter or
|
46
|
+
number but cannot be solely numbers.
|
47
|
+
|
48
|
+
=== URLs
|
49
|
+
|
50
|
+
Auto-linking and extraction of URLs differs from the Rails default so that it
|
51
|
+
will work correctly in Tweets written in languages that do not include spaces
|
52
|
+
between words.
|
53
|
+
|
54
|
+
=== International
|
55
|
+
|
56
|
+
Special care has been taken to be sure that auto-linking and extraction work
|
57
|
+
in Tweets of all languages. This means that languages without spaces between
|
58
|
+
words should work equally well.
|
59
|
+
|
60
|
+
=== Conformance
|
61
|
+
|
62
|
+
To run the Conformance suite, you'll need to add that project as a git submodule. From the root twitter-text-rb directory, run:
|
63
|
+
|
64
|
+
git submodule add git@github.com:mzsanford/twitter-text-conformance.git test/twitter-text-conformance/
|
65
|
+
git submodule init
|
66
|
+
git submodule update
|
data/Rakefile
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
require 'rubygems' unless ENV['NO_RUBYGEMS']
|
2
|
+
require 'rake/gempackagetask'
|
3
|
+
require 'rake/rdoctask'
|
4
|
+
require 'rubygems/specification'
|
5
|
+
require 'date'
|
6
|
+
require 'spec/rake/spectask'
|
7
|
+
require 'spec/rake/verify_rcov'
|
8
|
+
require 'digest'
|
9
|
+
|
10
|
+
# Gem metadata shared by the packaging, install and gemspec tasks.
spec = Gem::Specification.new do |gem|
  gem.name     = "twitter-text"
  gem.version  = "1.0.4"
  gem.authors  = ["Matt Sanford", "Patrick Ewing"]
  gem.email    = ["matt@twitter.com", "patrick.henry.ewing@gmail.com"]
  gem.homepage = "http://twitter.com"

  # Long description and short summary are distinct fields; set each once.
  gem.description = "A gem that provides text handling for Twitter"
  gem.summary     = "Twitter text handling library"

  gem.platform = Gem::Platform::RUBY
  gem.has_rdoc = true

  gem.add_dependency "actionpack"

  gem.require_path = 'lib'
  gem.autorequire  = ''
  gem.files        = %w(LICENSE README.rdoc Rakefile TODO) + Dir.glob("{lib,spec}/**/*")
end

# Running plain `rake` executes the spec suite.
task :default => :spec

desc "Run specs"
Spec::Rake::SpecTask.new do |spec_task|
  spec_task.spec_files = FileList['spec/**/*_spec.rb']
  spec_task.spec_opts  = %w(-fs --color)
end

desc "Run all examples with RCov"
Spec::Rake::SpecTask.new('spec:rcov') do |rcov_task|
  rcov_task.spec_files = FileList['spec/**/*.rb']
  rcov_task.rcov       = true
  # Keep the spec files themselves out of the coverage report.
  rcov_task.rcov_opts  = ['--exclude', 'spec']
end
|
43
|
+
|
44
|
+
|
45
|
+
# Computes a fingerprint of the conformance data stored directly inside +dir+.
#
# The SHA1 hex digest of every regular file directly under +dir+ is folded
# into one overall SHA1 digest. Files are visited in sorted order so the
# result does not depend on the order in which the file system lists them,
# and sub-directories are skipped because they cannot be digested as files.
#
# Returns a Digest::SHA1 instance; two results can be compared with == / !=.
def conformance_version(dir)
  data_files = Dir[File.join(dir, '*')].select { |path| File.file?(path) }.sort

  data_files.inject(Digest::SHA1.new) do |digest, file|
    digest.update(Digest::SHA1.file(file).hexdigest)
  end
end
|
48
|
+
|
49
|
+
namespace :test do
  namespace :conformance do

    desc "Update conformance testing data"
    task :update do
      puts "Updating conformance data ... "
      system("git submodule init") || raise("Failed to init submodule")
      system("git submodule update") || raise("Failed to update submodule")
      puts "Updating conformance data ... DONE"
    end

    desc "Change conformance test data to the latest version"
    task :latest => ['conformance:update'] do
      # Absolute paths so the git commands behave the same wherever rake was started.
      current_dir   = File.expand_path(File.dirname(__FILE__))
      submodule_dir = File.join(current_dir, "test", "twitter-text-conformance")
      version_before = conformance_version(submodule_dir)

      # Dir.chdir with a block restores the working directory afterwards, and the
      # multi-argument form of system bypasses the shell, so paths containing
      # spaces or shell metacharacters are passed through untouched.
      Dir.chdir(submodule_dir) do
        system("git", "pull", "origin", "master") || raise("Failed to pull submodule version")
      end

      if conformance_version(submodule_dir) != version_before
        Dir.chdir(current_dir) do
          system("git", "add", submodule_dir) || raise("Failed to add upgrade files")
          system("git", "commit", "-m", "Upgraded to the latest conformance suite", submodule_dir) ||
            raise("Failed to commit upgraded conformance data")
        end
        puts "Upgraded conformance suite."
      else
        puts "No conformance suite changes."
      end
    end

    desc "Run conformance test suite"
    task :run do
      ruby "test/conformance_test.rb"
    end
  end

  desc "Run conformance test suite"
  task :conformance => ['conformance:latest', 'conformance:run'] do
  end
end


namespace :doc do
  Rake::RDocTask.new do |rd|
    rd.main = "README.rdoc"
    rd.rdoc_dir = 'doc'
    rd.rdoc_files.include("README.rdoc", "lib/**/*.rb")
  end
end

Rake::GemPackageTask.new(spec) do |pkg|
  pkg.gem_spec = spec
end

desc "install the gem locally"
task :install => [:package] do
  # The package task writes pkg/<name>-<version>.gem; name and version come
  # from the gemspec so they cannot drift from what was actually built.
  sh %{sudo gem install pkg/#{spec.name}-#{spec.version}.gem}
end

desc "create a gemspec file"
task :make_spec do
  File.open("#{spec.name}.gemspec", "w") do |file|
    file.puts spec.to_ruby
  end
end

desc "runs cruise control build"
task :cruise => [:spec, 'test:conformance'] do
end
|
data/lib/autolink.rb
ADDED
@@ -0,0 +1,118 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Twitter
  # A module for including Tweet auto-linking in a class. The primary use of this is for helpers/views so they can auto-link
  # usernames, lists, hashtags and URLs.
  module Autolink
    include ActionView::Helpers::TagHelper #tag_options needed by auto_link

    WWW_REGEX = /www\./i #:nodoc:

    # Default CSS class for auto-linked URLs
    DEFAULT_URL_CLASS = "tweet-url"
    # Default CSS class for auto-linked lists (along with the url class)
    DEFAULT_LIST_CLASS = "list-slug"
    # Default CSS class for auto-linked usernames (along with the url class)
    DEFAULT_USERNAME_CLASS = "username"
    # Default CSS class for auto-linked hashtags (along with the url class)
    DEFAULT_HASHTAG_CLASS = "hashtag"
    # HTML attribute for robot nofollow behavior (default)
    HTML_ATTR_NO_FOLLOW = " rel=\"nofollow\""

    # Option keys that configure the auto-linker itself. They are accepted by
    # every <tt>auto_link*</tt> method but must never be rendered as HTML
    # attributes on a URL's <tt><a></tt> tag.
    OPTIONS_NOT_ATTRIBUTES = [
      :url_class, :list_class, :username_class, :hashtag_class,
      :username_url_base, :list_url_base, :hashtag_url_base,
      :suppress_lists, :suppress_no_follow
    ].freeze #:nodoc:

    # Add <tt><a></a></tt> tags around the usernames, lists, hashtags and URLs in the provided <tt>text</tt>. The
    # <tt><a></tt> tags can be controlled with the following entries in the <tt>options</tt>
    # hash:
    #
    # <tt>:url_class</tt>::     class to add to all <tt><a></tt> tags
    # <tt>:list_class</tt>::    class to add to list <tt><a></tt> tags
    # <tt>:username_class</tt>::    class to add to username <tt><a></tt> tags
    # <tt>:hashtag_class</tt>::    class to add to hashtag <tt><a></tt> tags
    # <tt>:username_url_base</tt>::      the value for <tt>href</tt> attribute on username links. The <tt>@username</tt> (minus the <tt>@</tt>) will be appended at the end of this.
    # <tt>:list_url_base</tt>::      the value for <tt>href</tt> attribute on list links. The <tt>@username/list</tt> (minus the <tt>@</tt>) will be appended at the end of this.
    # <tt>:hashtag_url_base</tt>::      the value for <tt>href</tt> attribute on hashtag links. The <tt>#hashtag</tt> (minus the <tt>#</tt>) will be appended at the end of this.
    # <tt>:suppress_lists</tt>::    disable auto-linking to lists
    # <tt>:suppress_no_follow</tt>::   Do not add <tt>rel="nofollow"</tt> to auto-linked items
    def auto_link(text, options = {})
      # Hashtags first, then URLs, then usernames/lists.
      auto_link_usernames_or_lists(
        auto_link_urls_custom(
          auto_link_hashtags(text, options),
        options),
      options)
    end

    # Add <tt><a></a></tt> tags around the usernames and lists in the provided <tt>text</tt>. The
    # <tt><a></tt> tags can be controlled with the following entries in the <tt>options</tt>
    # hash:
    #
    # <tt>:url_class</tt>::     class to add to all <tt><a></tt> tags
    # <tt>:list_class</tt>::    class to add to list <tt><a></tt> tags
    # <tt>:username_class</tt>::    class to add to username <tt><a></tt> tags
    # <tt>:username_url_base</tt>::      the value for <tt>href</tt> attribute on username links. The <tt>@username</tt> (minus the <tt>@</tt>) will be appended at the end of this.
    # <tt>:list_url_base</tt>::      the value for <tt>href</tt> attribute on list links. The <tt>@username/list</tt> (minus the <tt>@</tt>) will be appended at the end of this.
    # <tt>:suppress_lists</tt>::    disable auto-linking to lists
    # <tt>:suppress_no_follow</tt>::   Do not add <tt>rel="nofollow"</tt> to auto-linked items
    #
    # If a block is given it is called with the list (<tt>user/list</tt>) or
    # username and its return value is used as the link text.
    def auto_link_usernames_or_lists(text, options = {}) # :yields: list_or_username
      options = options.dup
      options[:url_class] ||= DEFAULT_URL_CLASS
      options[:list_class] ||= DEFAULT_LIST_CLASS
      options[:username_class] ||= DEFAULT_USERNAME_CLASS
      options[:username_url_base] ||= "http://twitter.com/"
      options[:list_url_base] ||= "http://twitter.com/"
      extra_html = HTML_ATTR_NO_FOLLOW unless options[:suppress_no_follow]

      text.gsub(Twitter::Regex[:auto_link_usernames_or_lists]) do
        # Copy the captures into locals before doing anything else.
        before, at, screen_name, list_slug = $1, $2, $3, $4

        if list_slug && !options[:suppress_lists]
          # the link is a list
          link_text = list = "#{screen_name}#{list_slug}"
          link_text = yield(list) if block_given?
          "#{before}#{at}<a class=\"#{options[:url_class]} #{options[:list_class]}\" href=\"#{options[:list_url_base]}#{list.downcase}\"#{extra_html}>#{link_text}</a>"
        else
          # this is a screen name
          link_text = screen_name
          link_text = yield(link_text) if block_given?
          # NOTE(review): unlike the list branch, the href is built from the
          # block's return value rather than the original screen name.
          # list_slug is part of the match, so it has to be re-emitted when
          # list linking is suppressed; otherwise "/list-name" would vanish
          # from the output. It is nil (rendered as "") for a plain username.
          "#{before}#{at}<a class=\"#{options[:url_class]} #{options[:username_class]}\" href=\"#{options[:username_url_base]}#{link_text}\"#{extra_html}>#{link_text}</a>#{list_slug}"
        end
      end
    end

    # Add <tt><a></a></tt> tags around the hashtags in the provided <tt>text</tt>. The
    # <tt><a></tt> tags can be controlled with the following entries in the <tt>options</tt>
    # hash:
    #
    # <tt>:url_class</tt>::     class to add to all <tt><a></tt> tags
    # <tt>:hashtag_class</tt>:: class to add to hashtag <tt><a></tt> tags
    # <tt>:hashtag_url_base</tt>::      the value for <tt>href</tt> attribute. The hashtag text (minus the <tt>#</tt>) will be appended at the end of this.
    # <tt>:suppress_no_follow</tt>::   Do not add <tt>rel="nofollow"</tt> to auto-linked items
    #
    # If a block is given it is called with the hashtag text (without the
    # <tt>#</tt>) and its return value is used in the link.
    def auto_link_hashtags(text, options = {})  # :yields: hashtag_text
      options = options.dup
      options[:url_class] ||= DEFAULT_URL_CLASS
      options[:hashtag_class] ||= DEFAULT_HASHTAG_CLASS
      options[:hashtag_url_base] ||= "http://twitter.com/search?q=%23"
      extra_html = HTML_ATTR_NO_FOLLOW unless options[:suppress_no_follow]

      text.gsub(Twitter::Regex[:auto_link_hashtags]) do
        before, hash, tag = $1, $2, $3
        tag = yield(tag) if block_given?
        "#{before}<a href=\"#{options[:hashtag_url_base]}#{tag}\" title=\"##{tag}\" class=\"#{options[:url_class]} #{options[:hashtag_class]}\"#{extra_html}>#{hash}#{tag}</a>"
      end
    end

    # Add <tt><a></a></tt> tags around the URLs in the provided <tt>text</tt>. Any
    # elements in the <tt>href_options</tt> hash will be converted to HTML attributes
    # and placed in the <tt><a></tt> tag, except for the auto-linker's own option keys
    # (<tt>:url_class</tt>, <tt>:suppress_lists</tt>, ...), which are ignored here so that the
    # options hash given to <tt>auto_link</tt> can be passed straight through.
    # Unless <tt>href_options</tt> contains <tt>:suppress_no_follow</tt>
    # the <tt>rel="nofollow"</tt> attribute will be added.
    def auto_link_urls_custom(text, href_options = {})
      options = href_options.dup
      suppress_no_follow = options.delete(:suppress_no_follow)
      OPTIONS_NOT_ATTRIBUTES.each { |key| options.delete(key) }
      options[:rel] = "nofollow" unless suppress_no_follow

      # The attribute string is the same for every URL, so build it once.
      html_attrs = tag_options(options.stringify_keys) || ""

      text.gsub(Twitter::Regex[:valid_url]) do
        before, url, protocol = $2, $3, $4
        # URLs written as "www.example.com" need a scheme to be a usable href.
        full_url = (protocol =~ WWW_REGEX ? "http://#{url}" : url)
        "#{before}<a href=\"#{full_url}\"#{html_attrs}>#{url}</a>"
      end
    end

  end
end
|
data/lib/extractor.rb
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Twitter
  # A module for including Tweet parsing in a class. This module provides functions for the extraction and processing
  # of usernames, lists, URLs and hashtags.
  module Extractor

    # Matches a URL "protocol" capture that is really a bare <tt>www.</tt> prefix.
    # Case-insensitive because the URL pattern itself is case-insensitive.
    WWW_PREFIX = /\Awww\.\z/i #:nodoc:

    # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
    # <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
    # will be returned.
    #
    # If a block is given then it will be called for each username.
    def extract_mentioned_screen_names(text) # :yields: username
      return [] unless text

      possible_screen_names = []
      text.to_s.scan(Twitter::Regex[:extract_mentions]) do |_before, sn, after|
        # Skip the name when the character following it is another at-sign.
        possible_screen_names << sn unless after =~ Twitter::Regex[:at_signs]
      end
      possible_screen_names.each { |sn| yield sn } if block_given?
      possible_screen_names
    end

    # Extracts the username replied to in the Tweet <tt>text</tt>. If the
    # <tt>text</tt> is <tt>nil</tt> or is not a reply nil will be returned.
    #
    # If a block is given then it will be called with the username replied to (if any)
    def extract_reply_screen_name(text) # :yields: username
      return nil unless text

      match = text.to_s.match(Twitter::Regex[:extract_reply])
      return nil unless match

      screen_name = match.captures.first
      yield screen_name if block_given?
      screen_name
    end

    # Extracts a list of all URLs included in the Tweet <tt>text</tt>. If the
    # <tt>text</tt> is <tt>nil</tt> or contains no URLs an empty array
    # will be returned. URLs written without a scheme (<tt>www.example.com</tt>,
    # in any letter case) are returned with <tt>http://</tt> prepended.
    #
    # If a block is given then it will be called for each URL.
    def extract_urls(text) # :yields: url
      return [] unless text

      urls = []
      text.to_s.scan(Twitter::Regex[:valid_url]) do |_all, _before, url, protocol|
        urls << (protocol =~ WWW_PREFIX ? "http://#{url}" : url)
      end
      urls.each { |url| yield url } if block_given?
      urls
    end

    # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
    # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
    # will be returned. The array returned will not include the leading <tt>#</tt>
    # character.
    #
    # If a block is given then it will be called for each hashtag.
    def extract_hashtags(text) # :yields: hashtag_text
      return [] unless text

      tags = []
      text.to_s.scan(Twitter::Regex[:auto_link_hashtags]) do |_before, _hash, hash_text|
        tags << hash_text
      end
      tags.each { |tag| yield tag } if block_given?
      tags
    end

  end
end
|
data/lib/regex.rb
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Twitter
  # A collection of regular expressions for parsing Tweet text. Every regular expression
  # is frozen at load time. These regular expressions are
  # used throughout the <tt>Twitter</tt> classes. Special care has been taken to make
  # sure these regular expressions work with Tweets in all languages.
  class Regex
    REGEXEN = {} # :nodoc:

    # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
    # to access both the list of characters and a pattern suitable for use with String#split
    #  Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
    UNICODE_SPACES = [
          (0x0009..0x000D).to_a,  # White_Space # Cc   [5] <control-0009>..<control-000D>
          0x0020,          # White_Space # Zs       SPACE
          0x0085,          # White_Space # Cc       <control-0085>
          0x00A0,          # White_Space # Zs       NO-BREAK SPACE
          0x1680,          # White_Space # Zs       OGHAM SPACE MARK
          0x180E,          # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
          (0x2000..0x200A).to_a, # White_Space # Zs  [11] EN QUAD..HAIR SPACE
          0x2028,          # White_Space # Zl       LINE SEPARATOR
          0x2029,          # White_Space # Zp       PARAGRAPH SEPARATOR
          0x202F,          # White_Space # Zs       NARROW NO-BREAK SPACE
          0x205F,          # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
          0x3000,          # White_Space # Zs       IDEOGRAPHIC SPACE
        ].flatten.freeze
    REGEXEN[:spaces] = Regexp.new(UNICODE_SPACES.map { |code| [code].pack('U*') }.join('|'))

    # The ASCII at-sign and hash-sign plus their full-width forms
    # (U+FF20 FULLWIDTH COMMERCIAL AT, U+FF03 FULLWIDTH NUMBER SIGN).
    # Built from code points, like LATIN_ACCENTS, so the pattern does not depend
    # on how this source file happens to be encoded or normalized.
    AT_SIGN_CHARS = [0x0040, 0xFF20].pack('U*').freeze # :nodoc:
    HASH_SIGN_CHARS = [0x0023, 0xFF03].pack('U*').freeze # :nodoc:

    REGEXEN[:at_signs] = /[#{AT_SIGN_CHARS}]/o
    # $1 preceding character (or start), $2 screen name; $3 is the character
    # following the name, captured through a look-ahead so it is not consumed.
    REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(?=(.|$))/o
    REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o

    REGEXEN[:list_name] = /^[a-zA-Z].{0,79}$/

    # Latin accented characters (subtracted 0xD7 from the range, it's a confusable multiplication sign. Looks like "x")
    LATIN_ACCENTS = [(0xc0..0xd6).to_a, (0xd8..0xf6).to_a, (0xf8..0xff).to_a].flatten.pack('U*').freeze
    REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o

    # Characters considered valid in a hashtag but not at the beginning, where only a-z and 0-9 are valid.
    HASHTAG_CHARACTERS = /[a-z0-9_#{LATIN_ACCENTS}]/io
    # $1 text before the hashtag, $2 the hash sign, $3 the hashtag text
    REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/]+)([#{HASH_SIGN_CHARS}])([0-9A-Z_]*[A-Z_]+#{HASHTAG_CHARACTERS}*)/io
    # $1 preceding character (or start), $2 at-sign(s), $3 screen name, $4 optional "/list-name"
    REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^)([#{AT_SIGN_CHARS}]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9\-]{0,79})?/o
    REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\<\|:~\(|\}:o\{|:\-\[|\>o\<|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/

    # URL related hash regex collection
    REGEXEN[:valid_preceding_chars] = /(?:[^\/"':!=]|^|\:)/
    REGEXEN[:valid_domain] = /(?:[^[:punct:]\s][\.-][^[:punct:]\s]|[^[:punct:]\s])+\.[a-z]{2,}(?::[0-9]+)?/i
    REGEXEN[:valid_url_path_chars] = /[\.\,]?[a-z0-9!\*'\(\);:=\+\$\/%#\[\]\-_,~@]/i
    # Valid end-of-path characters (so /foo. does not gobble the period).
    #   1. Allow ) for Wikipedia URLs.
    #   2. Allow =&# for empty URL parameters and other URL-join artifacts
    REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9\)=#\/]/i
    REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~]/i
    REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#]/i
    REGEXEN[:valid_url] = %r{
      (                                                                                     #   $1 total match
        (#{REGEXEN[:valid_preceding_chars]})                                                #   $2 Preceding character
        (                                                                                   #   $3 URL
          (https?:\/\/|www\.)                                                               #   $4 Protocol or beginning
          (#{REGEXEN[:valid_domain]})                                                       #   $5 Domain(s) and optional port number
          (/#{REGEXEN[:valid_url_path_chars]}*#{REGEXEN[:valid_url_path_ending_chars]}?)?   #   $6 URL Path
          (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? #   $7 Query String
        )
      )
    }iox;

    REGEXEN.each_value { |regex| regex.freeze }

    # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
    # is not a known symbol a <tt>nil</tt> will be returned.
    def self.[](key)
      REGEXEN[key]
    end
  end
end
|