twitter-text 1.0

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,13 @@
1
+ Copyright 2010 Twitter, Inc.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not
4
+ use this file except in compliance with the License. You may obtain a copy of
5
+ the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11
+ WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12
+ License for the specific language governing permissions and limitations under
13
+ the License.
data/README.rdoc ADDED
@@ -0,0 +1,58 @@
1
+ == twitter-text
2
+
3
+ A gem that provides text processing routines for Twitter Tweets. The major
4
+ reason for this is to unify the various auto-linking and extraction of
5
+ usernames, lists, hashtags and URLs.
6
+
7
+ == Extraction Examples
8
+
9
+ # Extraction
10
+ class MyClass
11
+ include Twitter::Extractor
12
+ usernames = extract_mentioned_screen_names("Mentioning @twitter and @jack")
13
+ # usernames = ["twitter", "jack"]
14
+ end
15
+
16
+ # Extraction with a block argument
17
+ class MyClass
18
+ include Twitter::Extractor
19
+ extract_reply_screen_name("@twitter are you hiring?") do |username|
20
+ # username = "twitter"
21
+ end
22
+ end
23
+
24
+ == Auto-linking Examples
25
+
26
+ # Auto-link
27
+ class MyClass
28
+ include Twitter::Autolink
29
+
30
+ html = auto_link("link @user, please #request")
31
+ end
32
+
33
+ === Usernames
34
+
35
+ Username extraction and linking matches all valid Twitter usernames but does
36
+ not verify that the username is a valid Twitter account.
37
+
38
+ === Lists
39
+
40
+ Auto-link and extract list names when they are written in @user/list-name
41
+ format.
42
+
43
+ === Hashtags
44
+
45
+ Auto-link and extract hashtags, where a hashtag contains any latin letter or
46
+ number but cannot be solely numbers.
47
+
48
+ === URLs
49
+
50
+ Auto-linking and extraction of URLs differs from the Rails default so that it
51
+ will work correctly in Tweets written in languages that do not include spaces
52
+ between words.
53
+
54
+ === International
55
+
56
+ Special care has been taken to be sure that auto-linking and extraction work
57
+ in Tweets of all languages. This means that languages without spaces between
58
+ words should work equally well.
data/Rakefile ADDED
@@ -0,0 +1,92 @@
1
+ require 'rubygems' unless ENV['NO_RUBYGEMS']
2
+ require 'rake/gempackagetask'
3
+ require 'rake/rdoctask'
4
+ require 'rubygems/specification'
5
+ require 'date'
6
+ require 'spec/rake/spectask'
7
+ require 'spec/rake/verify_rcov'
8
+
9
+
10
# Gem name and version. Shared by the gem specification and by the :install
# and :make_spec tasks, which interpolate them into file names.
GEM = "twitter-text"
GEM_VERSION = "1.0"

# Gem specification used by the packaging tasks below.
spec = Gem::Specification.new do |s|
  s.name = GEM
  s.version = GEM_VERSION
  s.author = "Matt Sanford"
  s.email = "matt@twitter.com"
  s.homepage = "http://twitter.com"
  s.description = "A gem that provides text handling for Twitter"
  s.summary = "Twitter text handling library"

  s.platform = Gem::Platform::RUBY
  s.has_rdoc = true

  # NOTE(review): the library only needs `require 'action_view'` to succeed;
  # confirm that a gem with this exact name provides it.
  s.add_dependency "action_view"

  s.require_path = 'lib'
  s.autorequire = ''
  s.files = %w(LICENSE README.rdoc Rakefile TODO) + Dir.glob("{lib,spec}/**/*")
end

task :default => :spec

desc "Run specs"
Spec::Rake::SpecTask.new do |t|
  t.spec_files = FileList['spec/**/*_spec.rb']
  t.spec_opts = %w(-fs --color)
end

desc "Run all examples with RCov"
Spec::Rake::SpecTask.new('spec:rcov') do |t|
  t.spec_files = FileList['spec/**/*.rb']
  t.rcov = true
  t.rcov_opts = ['--exclude', 'spec']
end

namespace :test do
  namespace :conformance do
    desc "Update conformance testing data"
    task :update do
      dir = File.join(File.dirname(__FILE__), "test", "twitter-text-conformance")
      puts "Updating conformance data ... "
      # Abort the whole rake run when the pull fails.
      system("cd #{dir} && git pull origin master") || exit(1)
      puts "Updating conformance data ... DONE"
    end

    desc "Run conformance test suite"
    task :run do
      ruby "test/conformance_test.rb"
    end
  end

  desc "Run conformance test suite"
  task :conformance => ['conformance:update', 'conformance:run'] do
  end
end


namespace :doc do
  Rake::RDocTask.new do |rd|
    rd.main = "README.rdoc"
    rd.rdoc_dir = 'doc'
    rd.rdoc_files.include("README.rdoc", "lib/**/*.rb")
  end
end

Rake::GemPackageTask.new(spec) do |pkg|
  pkg.gem_spec = spec
end

desc "install the gem locally"
task :install => [:package] do
  sh %{sudo gem install pkg/#{GEM}-#{GEM_VERSION}}
end

desc "create a gemspec file"
task :make_spec do
  File.open("#{GEM}.gemspec", "w") do |file|
    file.puts spec.to_ruby
  end
end

desc "runs cruise control build"
task :cruise => [:spec, 'test:conformance'] do
end
data/TODO ADDED
@@ -0,0 +1,3 @@
1
+ TODO:
2
+
3
+ * More tests
data/lib/autolink.rb ADDED
@@ -0,0 +1,101 @@
1
+
2
module Twitter
  # A module for including Tweet auto-linking in a class. The primary use of this is for helpers/views so they can
  # auto-link usernames, lists, hashtags and URLs.
  module Autolink
    include ActionView::Helpers::TagHelper # tag_options is needed by auto_link_urls_custom

    # Default CSS class for auto-linked URLs
    DEFAULT_URL_CLASS = "tweet-url"
    # Default CSS class for auto-linked lists (along with the url class)
    DEFAULT_LIST_CLASS = "list-slug"
    # Default CSS class for auto-linked usernames (along with the url class)
    DEFAULT_USERNAME_CLASS = "username"
    # Default CSS class for auto-linked hashtags (along with the url class)
    DEFAULT_HASHTAG_CLASS = "hashtag"

    # Option keys consumed by the username, list and hashtag linkers. <tt>auto_link</tt> removes them before
    # handing the remaining options to <tt>auto_link_urls_custom</tt>, which turns every entry it receives
    # into an HTML attribute.
    LINKER_OPTION_KEYS = [
      :url_class, :list_class, :username_class, :hashtag_class,
      :username_url_base, :list_url_base, :hashtag_url_base, :suppress_lists
    ].freeze # :nodoc:

    # Add <tt><a></a></tt> tags around the usernames, lists, hashtags and URLs in the provided <tt>text</tt>. The
    # <tt><a></tt> tags can be controlled with the following entries in the <tt>options</tt>
    # hash:
    #
    # <tt>:url_class</tt>::     class to add to all <tt><a></tt> tags
    # <tt>:list_class</tt>::    class to add to list <tt><a></tt> tags
    # <tt>:username_class</tt>::    class to add to username <tt><a></tt> tags
    # <tt>:hashtag_class</tt>::    class to add to hashtag <tt><a></tt> tags
    # <tt>:username_url_base</tt>::      the value for <tt>href</tt> attribute on username links. The <tt>@username</tt> (minus the <tt>@</tt>) will be appended at the end of this.
    # <tt>:list_url_base</tt>::      the value for <tt>href</tt> attribute on list links. The <tt>@username/list</tt> (minus the <tt>@</tt>) will be appended at the end of this.
    # <tt>:hashtag_url_base</tt>::      the value for <tt>href</tt> attribute on hashtag links. The <tt>#hashtag</tt> (minus the <tt>#</tt>) will be appended at the end of this.
    # <tt>:suppress_lists</tt>::    when true, <tt>@user/list</tt> is linked as a plain username
    #
    # Any other entry is added as an HTML attribute to the <tt><a></tt> tags generated for URLs.
    #
    # If a block is given it is called with each hashtag, list or username and its return value is used
    # in the generated link.
    def auto_link(text, options = {}, &block) # :yields: hashtag_or_list_or_username
      options = options.dup
      # Linker settings must not leak into URL anchors as bogus HTML attributes.
      href_options = options.reject { |key, _value| LINKER_OPTION_KEYS.include?(key) }

      linked = auto_link_hashtags(text, options, &block)
      linked = auto_link_urls_custom(linked, href_options)
      auto_link_usernames_or_lists(linked, options, &block)
    end

    # Add <tt><a></a></tt> tags around the usernames and lists in the provided <tt>text</tt>. The
    # <tt><a></tt> tags can be controlled with the following entries in the <tt>options</tt>
    # hash:
    #
    # <tt>:url_class</tt>::     class to add to all <tt><a></tt> tags
    # <tt>:list_class</tt>::    class to add to list <tt><a></tt> tags
    # <tt>:username_class</tt>::    class to add to username <tt><a></tt> tags
    # <tt>:username_url_base</tt>::      the value for <tt>href</tt> attribute on username links. The <tt>@username</tt> (minus the <tt>@</tt>) will be appended at the end of this.
    # <tt>:list_url_base</tt>::      the value for <tt>href</tt> attribute on list links. The <tt>@username/list</tt> (minus the <tt>@</tt>) will be appended at the end of this.
    # <tt>:suppress_lists</tt>::    when true, only the username part of <tt>@user/list</tt> is linked and the list slug is left as plain text
    #
    # If a block is given it is called with each list or username and its return value becomes the link text.
    def auto_link_usernames_or_lists(text, options = {}) # :yields: list_or_username
      options = options.dup
      options[:url_class] ||= DEFAULT_URL_CLASS
      options[:list_class] ||= DEFAULT_LIST_CLASS
      options[:username_class] ||= DEFAULT_USERNAME_CLASS
      options[:username_url_base] ||= "http://twitter.com/"
      options[:list_url_base] ||= "http://twitter.com/"

      text.gsub(Twitter::Regex[:auto_link_usernames_or_lists]) do
        # Copy the captures into locals before anything else can run.
        before, at, screen_name, slug = $1, $2, $3, $4

        if slug && !options[:suppress_lists]
          # the link is a list
          list = "#{screen_name}#{slug}"
          link_text = block_given? ? yield(list) : list
          "#{before}#{at}<a class=\"#{options[:url_class]} #{options[:list_class]}\" href=\"#{options[:list_url_base]}#{list.downcase}\">#{link_text}</a>"
        else
          # this is a screen name; a suppressed list slug is part of the match, so it is
          # re-emitted after the link instead of being dropped from the text
          link_text = block_given? ? yield(screen_name) : screen_name
          "#{before}#{at}<a class=\"#{options[:url_class]} #{options[:username_class]}\" href=\"#{options[:username_url_base]}#{screen_name}\">#{link_text}</a>#{slug}"
        end
      end
    end

    # Add <tt><a></a></tt> tags around the hashtags in the provided <tt>text</tt>. The
    # <tt><a></tt> tags can be controlled with the following entries in the <tt>options</tt>
    # hash:
    #
    # <tt>:url_class</tt>::     class to add to all <tt><a></tt> tags
    # <tt>:hashtag_class</tt>:: class to add to hashtag <tt><a></tt> tags
    # <tt>:hashtag_url_base</tt>::      the value for <tt>href</tt> attribute. The hashtag text (minus the <tt>#</tt>) will be appended at the end of this.
    #
    # If a block is given it is called with each hashtag text (minus the <tt>#</tt>); its return value is
    # used for the link text, the <tt>title</tt> and the end of the <tt>href</tt>.
    def auto_link_hashtags(text, options = {})  # :yields: hashtag_text
      options = options.dup
      options[:url_class] ||= DEFAULT_URL_CLASS
      options[:hashtag_class] ||= DEFAULT_HASHTAG_CLASS
      options[:hashtag_url_base] ||= "http://twitter.com/search?q=%23"

      text.gsub(Twitter::Regex[:auto_link_hashtags]) do
        before, hash, tag = $1, $2, $3
        tag = yield(tag) if block_given?
        "#{before}<a href=\"#{options[:hashtag_url_base]}#{tag}\" title=\"##{tag}\" class=\"#{options[:url_class]} #{options[:hashtag_class]}\">#{hash}#{tag}</a>"
      end
    end

    # Add <tt><a></a></tt> tags around the URLs in the provided <tt>text</tt>. Any
    # elements in the <tt>href_options</tt> hash will be converted to HTML attributes
    # and placed in the <tt><a></tt> tag. URLs that start with <tt>www.</tt> get an
    # <tt>http://</tt> prefix in the <tt>href</tt> only.
    def auto_link_urls_custom(text, href_options = {})
      # The attribute string is the same for every URL, so build it once.
      html_attrs = tag_options(href_options.stringify_keys) || ""

      text.gsub(Twitter::Regex[:valid_url]) do
        before, url, protocol = $2, $3, $4
        full_url = (protocol == "www." ? "http://#{url}" : url)
        "#{before}<a href=\"#{full_url}\"#{html_attrs}>#{url}</a>"
      end
    end

  end
end
data/lib/extractor.rb ADDED
@@ -0,0 +1,69 @@
1
+
2
module Twitter
  # A module for including Tweet parsing in a class. This module provides functions for the extraction and
  # processing of usernames, lists, URLs and hashtags.
  #
  # Every extractor returns early for <tt>nil</tt> and otherwise coerces its argument with <tt>to_s</tt>,
  # so any object with a meaningful string form can be passed.
  module Extractor

    # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
    # <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
    # will be returned.
    #
    # If a block is given then it will be called for each username.
    def extract_mentioned_screen_names(text) # :yields: username
      return [] unless text

      # Each scan result is [preceding_character, screen_name]; keep the name only.
      screen_names = text.to_s.scan(Twitter::Regex[:extract_mentions]).map { |_before, sn| sn }
      screen_names.each { |sn| yield sn } if block_given?
      screen_names
    end

    # Extracts the username replied to in the Tweet <tt>text</tt>. If the
    # <tt>text</tt> is <tt>nil</tt> or is not a reply <tt>nil</tt> will be returned.
    #
    # If a block is given then it will be called with the username replied to (if any)
    def extract_reply_screen_name(text) # :yields: username
      return nil unless text

      reply = text.to_s.match(Twitter::Regex[:extract_reply])
      return nil unless reply

      screen_name = reply.captures.first
      yield screen_name if block_given?
      screen_name
    end

    # Extracts a list of all URLs included in the Tweet <tt>text</tt>. If the
    # <tt>text</tt> is <tt>nil</tt> or contains no URLs an empty array
    # will be returned. URLs that start with <tt>www.</tt> are returned with an
    # <tt>http://</tt> prefix.
    #
    # If a block is given then it will be called for each URL.
    def extract_urls(text) # :yields: url
      return [] unless text

      # Captures are [total, preceding_character, url, protocol, ...]; only url and protocol are needed.
      urls = text.to_s.scan(Twitter::Regex[:valid_url]).map do |_all, _before, url, protocol|
        protocol == "www." ? "http://#{url}" : url
      end
      urls.each { |url| yield url } if block_given?
      urls
    end

    # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
    # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
    # will be returned. The array returned will not include the leading <tt>#</tt>
    # character.
    #
    # If a block is given then it will be called for each hashtag.
    def extract_hashtags(text) # :yields: hashtag_text
      return [] unless text

      # Captures are [preceding_text, hash_character, hashtag_text].
      tags = text.to_s.scan(Twitter::Regex[:auto_link_hashtags]).map { |_before, _hash, tag| tag }
      tags.each { |tag| yield tag } if block_given?
      tags
    end

  end
end
data/lib/regex.rb ADDED
@@ -0,0 +1,74 @@
1
+
2
+ module Twitter
3
+ # A collection of regular expressions for parsing Tweet text. The regular expression
4
+ # objects are each frozen at load time (the REGEXEN hash itself is not frozen). These regular expressions are
5
+ # used throughout the <tt>Twitter</tt> classes. Special care has been taken to make
6
+ # sure these regular expressions work with Tweets in all languages.
7
+ class Regex
8
+ REGEXEN = {} # :nodoc:
9
+
10
+ # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
11
+ # to access both the list of characters and a pattern suitable for use with String#split
12
+ # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
13
+ UNICODE_SPACES = [
14
+ (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
15
+ 0x0020, # White_Space # Zs SPACE
16
+ 0x0085, # White_Space # Cc <control-0085>
17
+ 0x00A0, # White_Space # Zs NO-BREAK SPACE
18
+ 0x1680, # White_Space # Zs OGHAM SPACE MARK
19
+ 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
20
+ (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
21
+ 0x2028, # White_Space # Zl LINE SEPARATOR
22
+ 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
23
+ 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
24
+ 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
25
+ 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
26
+ ].flatten.freeze
27
+ REGEXEN[:spaces] = Regexp.new(UNICODE_SPACES.collect{ |e| [e].pack 'U*' }.join('|'))
28
+
29
+ REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])[@@]([a-zA-Z0-9_]{1,20})/
30
+ REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*[@@]([a-zA-Z0-9_]{1,20})/o
31
+
32
+ REGEXEN[:list_name] = /^[a-zA-Z\x80-\xff].{0,79}$/
33
+
34
+ # Latin accented characters (subtracted 0xD7 from the range, it's a confusable multiplication sign. Looks like "x")
35
+ LATIN_ACCENTS = [(0xc0..0xd6).to_a, (0xd8..0xf6).to_a, (0xf8..0xff).to_a].flatten.pack('U*').freeze
36
+ REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
37
+
38
+ # Characters considered valid in a hashtag but not at the beginning, where only a-z and 0-9 are valid.
39
+ HASHTAG_CHARACTERS = /[a-z0-9_#{LATIN_ACCENTS}]/io
40
+ REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/]+)(#|#)([0-9A-Z_]*[A-Z_]+#{HASHTAG_CHARACTERS}*)/io
41
+ REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9\x80-\xff\-]{0,79})?/
42
+ REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
43
+
44
+ # URL related hash regex collection
45
+ REGEXEN[:valid_preceeding_chars] = /(?:[^\/"':!=]|^)/
46
+ REGEXEN[:valid_domain] = /[a-z0-9\.-]+\.[a-z]{2,}(?::[0-9]+)?/i
47
+ REGEXEN[:valid_url_path_chars] = /[a-z0-9!\*'\(\);:@&=\+\$\/%#\[\]\-_\.,~]/i
48
+ # Valid end-of-path characters (so /foo. does not gobble the period).
49
+ # 1. Allow ) for Wikipedia URLs.
50
+ # 2. Allow =&# for empty URL parameters and other URL-join artifacts
51
+ REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9\)=#\/]/i
52
+ REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:@&=\+\$\/%#\[\]\-_\.,~]/i
53
+ REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#]/i
54
+ REGEXEN[:valid_url] = %r{
55
+ ( # $1 total match
56
+ (#{REGEXEN[:valid_preceeding_chars]}) # $2 Preceeding chracter
57
+ ( # $3 URL
58
+ (https?:\/\/|www\.) # $4 Protocol or beginning
59
+ (#{REGEXEN[:valid_domain]}) # $5 Domain(s) and optional post number
60
+ (/#{REGEXEN[:valid_url_path_chars]}*#{REGEXEN[:valid_url_path_ending_chars]}?)? # $6 URL Path
61
+ (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $7 Query String
62
+ )
63
+ )
64
+ }iox;
65
+
66
+ REGEXEN.each_pair{|k,v| v.freeze }
67
+
68
+ # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
69
+ # is not a known symbol a <tt>nil</tt> will be returned.
70
+ def self.[](key)
71
+ REGEXEN[key]
72
+ end
73
+ end
74
+ end
@@ -0,0 +1,13 @@
1
+
2
+ raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless ['u','UTF8'].include?($KCODE)
3
+
4
+ require 'rubygems'
5
+
6
+ # Needed for auto-linking (Twitter::Autolink includes ActionView::Helpers::TagHelper for tag_options)
7
+ require 'action_view'
8
+
9
+ require File.join(File.dirname(__FILE__), 'regex')
10
+ require File.join(File.dirname(__FILE__), 'autolink')
11
+ require File.join(File.dirname(__FILE__), 'extractor')
12
+ require File.join(File.dirname(__FILE__), 'unicode')
13
+ require File.join(File.dirname(__FILE__), 'validation')
data/lib/unicode.rb ADDED
@@ -0,0 +1,27 @@
1
+
2
module Twitter
  # This module lazily defines constants of the form Uxxxx for all Unicode
  # codepoints from U0000 to U10FFFF. The value of each constant is the
  # UTF-8 string for the codepoint.
  # Examples:
  #   copyright = Unicode::U00A9
  #   euro = Unicode::U20AC
  #   infinity = Unicode::U221E
  #
  module Unicode
    # Matches a whole constant name: "U", an optional underscore, then 4-5 hex
    # digits or "10" followed by 4 hex digits. \A and \z anchor the entire
    # name rather than a single line of it.
    CODEPOINT_REGEX = /\AU_?([0-9a-fA-F]{4,5}|10[0-9a-fA-F]{4})\z/

    # Called by Ruby when an undefined constant is referenced on this module.
    # Defines and returns the frozen UTF-8 string for codepoint-style names;
    # raises NameError (carrying the missing name) for anything else.
    def self.const_missing(name)
      codepoint = CODEPOINT_REGEX.match(name.to_s)
      unless codepoint
        # Raise an error for constants that are not Unicode.
        raise NameError.new("Uninitialized constant: Unicode::#{name}", name)
      end

      # Define a real constant so later references never reach const_missing.
      const_set(name, [codepoint[1].to_i(16)].pack("U").freeze)
    end
  end

end