twitter-text 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,13 @@
1
+ Copyright 2010 Twitter, Inc.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not
4
+ use this file except in compliance with the License. You may obtain a copy of
5
+ the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11
+ WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12
+ License for the specific language governing permissions and limitations under
13
+ the License.
data/README.rdoc ADDED
@@ -0,0 +1,58 @@
1
+ == twitter-text
2
+
3
+ A gem that provides text processing routines for Twitter Tweets. The major
4
+ reason for this is to unify the various auto-linking and extraction of
5
+ usernames, lists, hashtags and URLs.
6
+
7
+ == Extraction Examples
8
+
9
+ # Extraction
10
+ class MyClass
11
+ include Twitter::Extractor
12
+ usernames = extract_mentioned_screen_names("Mentioning @twitter and @jack")
13
+ # usernames = ["twitter", "jack"]
14
+ end
15
+
16
+ # Extraction with a block argument
17
+ class MyClass
18
+ include Twitter::Extractor
19
+ extract_reply_screen_name("@twitter are you hiring?") do |username|
20
+ # username = "twitter"
21
+ end
22
+ end
23
+
24
+ == Auto-linking Examples
25
+
26
+ # Auto-link
27
+ class MyClass
28
+ include Twitter::Autolink
29
+
30
+ html = auto_link("link @user, please #request")
31
+ end
32
+
33
+ === Usernames
34
+
35
+ Username extraction and linking matches all valid Twitter usernames but does
36
+ not verify that the username is a valid Twitter account.
37
+
38
+ === Lists
39
+
40
+ Auto-link and extract list names when they are written in @user/list-name
41
+ format.
42
+
43
+ === Hashtags
44
+
45
+ Auto-link and extract hashtags, where a hashtag contains any latin letter or
46
+ number but cannot be solely numbers.
47
+
48
+ === URLs
49
+
50
+ Auto-linking and extraction of URLs differs from the Rails default so that it
51
+ will work correctly in Tweets written in languages that do not include spaces
52
+ between words.
53
+
54
+ === International
55
+
56
+ Special care has been taken to be sure that auto-linking and extraction work
57
+ in Tweets of all languages. This means that languages without spaces between
58
+ words should work equally well.
data/Rakefile ADDED
@@ -0,0 +1,92 @@
1
+ require 'rubygems' unless ENV['NO_RUBYGEMS']
2
+ require 'rake/gempackagetask'
3
+ require 'rake/rdoctask'
4
+ require 'rubygems/specification'
5
+ require 'date'
6
+ require 'spec/rake/spectask'
7
+ require 'spec/rake/verify_rcov'
8
+
9
+
10
+ spec = Gem::Specification.new do |s|
11
+ s.name = "twitter-text"
12
+ s.version = "1.0"
13
+ s.author = "Matt Sanford"
14
+ s.email = "matt@twitter.com"
15
+ s.homepage = "http://twitter.com"
16
+ s.description = s.summary = "A gem that provides text handling for Twitter"
17
+
18
+ s.platform = Gem::Platform::RUBY
19
+ s.has_rdoc = true
20
+ s.summary = "Twitter text handling library"
21
+
22
+ s.add_dependency "action_view"
23
+
24
+ s.require_path = 'lib'
25
+ s.autorequire = ''
26
+ s.files = %w(LICENSE README.rdoc Rakefile TODO) + Dir.glob("{lib,spec}/**/*")
27
+ end
28
+
29
+ task :default => :spec
30
+
31
+ desc "Run specs"
32
+ Spec::Rake::SpecTask.new do |t|
33
+ t.spec_files = FileList['spec/**/*_spec.rb']
34
+ t.spec_opts = %w(-fs --color)
35
+ end
36
+
37
+ desc "Run all examples with RCov"
38
+ Spec::Rake::SpecTask.new('spec:rcov') do |t|
39
+ t.spec_files = FileList['spec/**/*.rb']
40
+ t.rcov = true
41
+ t.rcov_opts = ['--exclude', 'spec']
42
+ end
43
+
44
+ namespace :test do
45
+ namespace :conformance do
46
+ desc "Update conformance testing data"
47
+ task :update do
48
+ dir = File.join(File.dirname(__FILE__), "test", "twitter-text-conformance")
49
+ puts "Updating conformance data ... "
50
+ system("cd #{dir} && git pull origin master") || exit(1)
51
+ puts "Updating conformance data ... DONE"
52
+ end
53
+
54
+ desc "Run conformance test suite"
55
+ task :run do
56
+ ruby "test/conformance_test.rb"
57
+ end
58
+ end
59
+
60
+ desc "Run conformance test suite"
61
+ task :conformance => ['conformance:update', 'conformance:run'] do
62
+ end
63
+ end
64
+
65
+
66
+ namespace :doc do
67
+ Rake::RDocTask.new do |rd|
68
+ rd.main = "README.rdoc"
69
+ rd.rdoc_dir = 'doc'
70
+ rd.rdoc_files.include("README.rdoc", "lib/**/*.rb")
71
+ end
72
+ end
73
+
74
+ Rake::GemPackageTask.new(spec) do |pkg|
75
+ pkg.gem_spec = spec
76
+ end
77
+
78
+ desc "install the gem locally"
79
+ task :install => [:package] do
80
+ sh %{sudo gem install pkg/#{GEM}-#{GEM_VERSION}}
81
+ end
82
+
83
+ desc "create a gemspec file"
84
+ task :make_spec do
85
+ File.open("#{GEM}.gemspec", "w") do |file|
86
+ file.puts spec.to_ruby
87
+ end
88
+ end
89
+
90
+ desc "runs cruise control build"
91
+ task :cruise => [:spec, 'test:conformance'] do
92
+ end
data/TODO ADDED
@@ -0,0 +1,3 @@
1
+ TODO:
2
+
3
+ * More tests
data/lib/autolink.rb ADDED
@@ -0,0 +1,101 @@
1
+
2
+ module Twitter
3
+ # A module for including Tweet auto-linking in a class. The primary use of this is for helpers/views so they can auto-link
4
+ # usernames, lists, hashtags and URLs.
5
+ module Autolink
6
+ include ActionView::Helpers::TagHelper #tag_options needed by auto_link
7
+
8
+ # Default CSS class for auto-linked URLs
9
+ DEFAULT_URL_CLASS = "tweet-url"
10
+ # Default CSS class for auto-linked lists (along with the url class)
11
+ DEFAULT_LIST_CLASS = "list-slug"
12
+ # Default CSS class for auto-linked usernames (along with the url class)
13
+ DEFAULT_USERNAME_CLASS = "username"
14
+ # Default CSS class for auto-linked hashtags (along with the url class)
15
+ DEFAULT_HASHTAG_CLASS = "hashtag"
16
+
17
+ # Add <tt><a></a></tt> tags around the usernames, lists, hashtags and URLs in the provided <tt>text</tt>. The
18
+ # <tt><a></tt> tags can be controlled with the following entries in the <tt>options</tt>
19
+ # hash:
20
+ #
21
+ # <tt>:url_class</tt>:: class to add to all <tt><a></tt> tags
22
+ # <tt>:list_class</tt>:: class to add to list <tt><a></tt> tags
23
+ # <tt>:username_class</tt>:: class to add to username <tt><a></tt> tags
24
+ # <tt>:hashtag_class</tt>:: class to add to hashtag <tt><a></tt> tags
25
+ # <tt>:username_url_base</tt>:: the value for <tt>href</tt> attribute on username links. The <tt>@username</tt> (minus the <tt>@</tt>) will be appended at the end of this.
26
+ # <tt>:list_url_base</tt>:: the value for <tt>href</tt> attribute on list links. The <tt>@username/list</tt> (minus the <tt>@</tt>) will be appended at the end of this.
27
+ # <tt>:hashtag_url_base</tt>:: the value for <tt>href</tt> attribute on hashtag links. The <tt>#hashtag</tt> (minus the <tt>#</tt>) will be appended at the end of this.
28
+ def auto_link(text, options = {}, &block) # :yields: hashtag_or_list_or_username
29
+ options = options.dup
30
+ auto_link_usernames_or_lists(auto_link_urls_custom(auto_link_hashtags(text), options, &block), &block)
31
+ end
32
+
33
+ # Add <tt><a></a></tt> tags around the usernames and lists in the provided <tt>text</tt>. The
34
+ # <tt><a></tt> tags can be controlled with the following entries in the <tt>options</tt>
35
+ # hash:
36
+ #
37
+ # <tt>:url_class</tt>:: class to add to all <tt><a></tt> tags
38
+ # <tt>:list_class</tt>:: class to add to list <tt><a></tt> tags
39
+ # <tt>:username_class</tt>:: class to add to username <tt><a></tt> tags
40
+ # <tt>:username_url_base</tt>:: the value for <tt>href</tt> attribute on username links. The <tt>@username</tt> (minus the <tt>@</tt>) will be appended at the end of this.
41
+ # <tt>:list_url_base</tt>:: the value for <tt>href</tt> attribute on list links. The <tt>@username/list</tt> (minus the <tt>@</tt>) will be appended at the end of this.
42
+ def auto_link_usernames_or_lists(text, options = {}) # :yields: list_or_username
43
+ options = options.dup
44
+ options[:url_class] ||= DEFAULT_URL_CLASS
45
+ options[:list_class] ||= DEFAULT_LIST_CLASS
46
+ options[:username_class] ||= DEFAULT_USERNAME_CLASS
47
+ options[:username_url_base] ||= "http://twitter.com/"
48
+ options[:list_url_base] ||= "http://twitter.com/"
49
+
50
+ text.gsub(Twitter::Regex[:auto_link_usernames_or_lists]) do
51
+ if $4 && !options[:suppress_lists]
52
+ # the link is a list
53
+ text = list = "#{$3}#{$4}"
54
+ text = yield(list) if block_given?
55
+ "#{$1}#{$2}<a class=\"#{options[:url_class]} #{options[:list_class]}\" href=\"#{options[:list_url_base]}#{list.downcase}\">#{text}</a>"
56
+ else
57
+ # this is a screen name
58
+ text = $3
59
+ text = yield(text) if block_given?
60
+ "#{$1}#{$2}<a class=\"#{options[:url_class]} #{options[:username_class]}\" href=\"#{options[:username_url_base]}#{$3}\">#{text}</a>"
61
+ end
62
+ end
63
+ end
64
+
65
+ # Add <tt><a></a></tt> tags around the hashtags in the provided <tt>text</tt>. The
66
+ # <tt><a></tt> tags can be controlled with the following entries in the <tt>options</tt>
67
+ # hash:
68
+ #
69
+ # <tt>:url_class</tt>:: class to add to all <tt><a></tt> tags
70
+ # <tt>:hashtag_class</tt>:: class to add to hashtag <tt><a></tt> tags
71
+ # <tt>:hashtag_url_base</tt>:: the value for <tt>href</tt> attribute. The hashtag text (minus the <tt>#</tt>) will be appended at the end of this.
72
+ #
73
+ def auto_link_hashtags(text, options = {}) # :yields: hashtag_text
74
+ options = options.dup # work on a copy so the defaults below do not modify the caller's hash
75
+ options[:url_class] ||= DEFAULT_URL_CLASS
76
+ options[:hashtag_class] ||= DEFAULT_HASHTAG_CLASS
77
+ options[:hashtag_url_base] ||= "http://twitter.com/search?q=%23"
78
+
79
+ text.gsub(Twitter::Regex[:auto_link_hashtags]) do
80
+ before = $1 # text matched ahead of the hash sign, re-emitted unchanged
81
+ hash = $2 # the hash sign as written in the text
82
+ text = $3 # the hashtag text without the hash sign
83
+ text = yield(text) if block_given? # the block's return value is used in the href, the title and the link body
84
+ "#{before}<a href=\"#{options[:hashtag_url_base]}#{text}\" title=\"##{text}\" class=\"#{options[:url_class]} #{options[:hashtag_class]}\">#{hash}#{text}</a>"
85
+ end
86
+ end
87
+
88
+ # Add <tt><a></a></tt> tags around the URLs in the provided <tt>text</tt>. Any
89
+ # elements in the <tt>href_options</tt> hash will be converted to HTML attributes
90
+ # and placed in the <tt><a></tt> tag.
91
+ def auto_link_urls_custom(text, href_options = {})
92
+ text.gsub(Twitter::Regex[:valid_url]) do
93
+ all, before, url, protocol = $1, $2, $3, $4 # :valid_url groups: total match, preceding character, URL, protocol or "www." beginning
94
+ options = tag_options(href_options.stringify_keys) || "" # attribute string built from href_options; the fallback suggests tag_options can return nil
95
+ full_url = (protocol == "www." ? "http://#{url}" : url) # URLs matched by a bare "www." get an explicit http:// scheme in the href
96
+ "#{before}<a href=\"#{full_url}\"#{options}>#{url}</a>"
97
+ end
98
+ end
99
+
100
+ end
101
+ end
data/lib/extractor.rb ADDED
@@ -0,0 +1,69 @@
1
+
2
+ module Twitter
3
+ # A module for including Tweet parsing in a class. This module provides functions for the extraction and processing
4
+ # of usernames, lists, URLs and hashtags.
5
+ module Extractor
6
+
7
+ # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
8
+ # <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
9
+ # will be returned.
10
+ #
11
+ # If a block is given then it will be called for each username.
12
+ def extract_mentioned_screen_names(text) # :yields: username
13
+ return [] unless text
14
+
15
+ possible_screen_names = []
16
+ text.scan(Twitter::Regex[:extract_mentions]) {|before,sn| possible_screen_names << sn }
17
+ possible_screen_names.each{|sn| yield sn } if block_given?
18
+ possible_screen_names
19
+ end
20
+
21
+ # Extracts the username replied to in the Tweet <tt>text</tt>. If the
22
+ # <tt>text</tt> is <tt>nil</tt> or is not a reply nil will be returned.
23
+ #
24
+ # If a block is given then it will be called with the username replied to (if any)
25
+ def extract_reply_screen_name(text) # :yields: username
26
+ return nil unless text
27
+
28
+ possible_screen_name = text.match(Twitter::Regex[:extract_reply])
29
+ return unless possible_screen_name.respond_to?(:captures) # no match means the text is not a reply: return nil without yielding
30
+ screen_name = possible_screen_name.captures.first # the only capture group of :extract_reply, the name without the at sign
31
+ yield screen_name if block_given?
32
+ screen_name
33
+ end
34
+
35
+ # Extracts a list of all URLs included in the Tweet <tt>text</tt>. If the
36
+ # <tt>text</tt> is <tt>nil</tt> or contains no URLs an empty array
37
+ # will be returned.
38
+ #
39
+ # If a block is given then it will be called for each URL.
40
+ def extract_urls(text) # :yields: url
41
+ return [] unless text
42
+
43
+ urls = []
44
+ text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, path, query| # one block argument per capture group of :valid_url
45
+ urls << (protocol == "www." ? "http://#{url}" : url) # URLs matched by a bare "www." are returned with an http:// scheme
46
+ end
47
+ urls.each{|url| yield url } if block_given?
48
+ urls
49
+ end
50
+
51
+ # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
52
+ # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
53
+ # will be returned. The array returned will not include the leading <tt>#</tt>
54
+ # character.
55
+ #
56
+ # If a block is given then it will be called for each hashtag.
57
+ def extract_hashtags(text) # :yields: hashtag_text
58
+ return [] unless text
59
+
60
+ tags = []
61
+ text.scan(Twitter::Regex[:auto_link_hashtags]) do |before, hash, hash_text|
62
+ tags << hash_text
63
+ end
64
+ tags.each{|tag| yield tag } if block_given?
65
+ tags
66
+ end
67
+
68
+ end
69
+ end
data/lib/regex.rb ADDED
@@ -0,0 +1,74 @@
1
+
2
+ module Twitter
3
+ # A collection of regular expressions for parsing Tweet text. The regular expression
4
+ # list is frozen at load time to ensure immutability. These regular expressions are
5
+ # used throughout the <tt>Twitter</tt> classes. Special care has been taken to make
6
+ # sure these regular expressions work with Tweets in all languages.
7
+ class Regex
8
+ REGEXEN = {} # :nodoc:
9
+
10
+ # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
11
+ # to access both the list of characters and a pattern suitable for use with String#split
12
+ # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
13
+ UNICODE_SPACES = [
14
+ (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
15
+ 0x0020, # White_Space # Zs SPACE
16
+ 0x0085, # White_Space # Cc <control-0085>
17
+ 0x00A0, # White_Space # Zs NO-BREAK SPACE
18
+ 0x1680, # White_Space # Zs OGHAM SPACE MARK
19
+ 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
20
+ (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
21
+ 0x2028, # White_Space # Zl LINE SEPARATOR
22
+ 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
23
+ 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
24
+ 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
25
+ 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
26
+ ].flatten.freeze
27
+ REGEXEN[:spaces] = Regexp.new(UNICODE_SPACES.collect{ |e| [e].pack 'U*' }.join('|'))
28
+
29
+ REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])[@＠]([a-zA-Z0-9_]{1,20})/ # accepts the ASCII at sign and the full-width at sign (U+FF20)
30
+ REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*[@＠]([a-zA-Z0-9_]{1,20})/o # accepts the ASCII at sign and the full-width at sign (U+FF20)
31
+
32
+ REGEXEN[:list_name] = /^[a-zA-Z\x80-\xff].{0,79}$/
33
+
34
+ # Latin accented characters (0xD7 and 0xF7 are left out of the range: they are the multiplication and division signs, and 0xD7 is easily confused with "x")
35
+ LATIN_ACCENTS = [(0xc0..0xd6).to_a, (0xd8..0xf6).to_a, (0xf8..0xff).to_a].flatten.pack('U*').freeze
36
+ REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
37
+
38
+ # Characters considered valid in a hashtag. Accented characters are not valid at the beginning, where only a-z, 0-9 and _ are accepted.
39
+ HASHTAG_CHARACTERS = /[a-z0-9_#{LATIN_ACCENTS}]/io
40
+ REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/]+)(#|＃)([0-9A-Z_]*[A-Z_]+#{HASHTAG_CHARACTERS}*)/io # accepts the ASCII hash sign and the full-width hash sign (U+FF03)
41
+ REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^)([@＠]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9\x80-\xff\-]{0,79})?/ # accepts the ASCII at sign and the full-width at sign (U+FF20)
42
+ REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
43
+
44
+ # URL related hash regex collection
45
+ REGEXEN[:valid_preceeding_chars] = /(?:[^\/"':!=]|^)/
46
+ REGEXEN[:valid_domain] = /[a-z0-9\.-]+\.[a-z]{2,}(?::[0-9]+)?/i
47
+ REGEXEN[:valid_url_path_chars] = /[a-z0-9!\*'\(\);:@&=\+\$\/%#\[\]\-_\.,~]/i
48
+ # Valid end-of-path characters (so /foo. does not gobble the period).
49
+ # 1. Allow ) for Wikipedia URLs.
50
+ # 2. Allow =&# for empty URL parameters and other URL-join artifacts
51
+ REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9\)=#\/]/i
52
+ REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:@&=\+\$\/%#\[\]\-_\.,~]/i
53
+ REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#]/i
54
+ REGEXEN[:valid_url] = %r{
55
+ ( # $1 total match
56
+ (#{REGEXEN[:valid_preceeding_chars]}) # $2 Preceeding chracter
57
+ ( # $3 URL
58
+ (https?:\/\/|www\.) # $4 Protocol or beginning
59
+ (#{REGEXEN[:valid_domain]}) # $5 Domain(s) and optional post number
60
+ (/#{REGEXEN[:valid_url_path_chars]}*#{REGEXEN[:valid_url_path_ending_chars]}?)? # $6 URL Path
61
+ (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $7 Query String
62
+ )
63
+ )
64
+ }iox;
65
+
66
+ REGEXEN.each_pair{|k,v| v.freeze }
67
+
68
+ # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
69
+ # is not a known symbol a <tt>nil</tt> will be returned.
70
+ def self.[](key)
71
+ REGEXEN[key]
72
+ end
73
+ end
74
+ end
@@ -0,0 +1,13 @@
1
+
2
+ raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless ['u','UTF8'].include?($KCODE)
3
+
4
+ require 'rubygems'
5
+
6
+ # Needed for auto-linking
7
+ require 'action_view'
8
+
9
+ require File.join(File.dirname(__FILE__), 'regex')
10
+ require File.join(File.dirname(__FILE__), 'autolink')
11
+ require File.join(File.dirname(__FILE__), 'extractor')
12
+ require File.join(File.dirname(__FILE__), 'unicode')
13
+ require File.join(File.dirname(__FILE__), 'validation')
data/lib/unicode.rb ADDED
@@ -0,0 +1,27 @@
1
+
2
+ module Twitter
3
+ # This module lazily defines constants of the form Uxxxx for all Unicode
4
+ # codepoints from U0000 to U10FFFF. The value of each constant is the
5
+ # UTF-8 string for the codepoint.
6
+ # Examples:
7
+ # copyright = Unicode::U00A9
8
+ # euro = Unicode::U20AC
9
+ # infinity = Unicode::U221E
10
+ #
11
+ module Unicode
12
+ CODEPOINT_REGEX = /^U_?([0-9a-fA-F]{4,5}|10[0-9a-fA-F]{4})$/
13
+
14
+ def self.const_missing(name)
15
+ # Check that the constant name is of the right form: U0000 to U10FFFF
16
+ if name.to_s =~ CODEPOINT_REGEX
17
+ # Convert the codepoint to an immutable UTF-8 string,
18
+ # define a real constant for that value and return the value
19
+ #p name, name.class
20
+ const_set(name, [$1.to_i(16)].pack("U").freeze)
21
+ else # Raise an error for constants that are not Unicode.
22
+ raise NameError, "Uninitialized constant: Unicode::#{name}"
23
+ end
24
+ end
25
+ end
26
+
27
+ end