RubyGems - twitter-text - Versions diffs - 1.0 - Mend

twitter-text 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

data/LICENSE ADDED Viewed

@@ -0,0 +1,13 @@
+Copyright 2010 Twitter, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not
+use this file except in compliance with the License. You may obtain a copy of
+the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+License for the specific language governing permissions and limitations under
+the License.

data/README.rdoc ADDED Viewed

@@ -0,0 +1,58 @@
+== twitter-text
+A gem that provides text processing routines for Twitter Tweets. Its main
+purpose is to unify the various implementations of auto-linking and extraction
+of usernames, lists, hashtags and URLs.
+== Extraction Examples
+  # Extraction
+  class MyClass
+    include Twitter::Extractor
+    usernames = extract_mentioned_screen_names("Mentioning @twitter and @jack")
+    # usernames = ["twitter", "jack"]
+  end
+  # Extraction with a block argument
+  class MyClass
+    include Twitter::Extractor
+    extract_reply_screen_name("@twitter are you hiring?") do |username|
+      # username = "twitter"
+    end
+  end
+== Auto-linking Examples
+  # Auto-link
+  class MyClass
+    include Twitter::Autolink
+    html = auto_link("link @user, please #request")
+  end
+=== Usernames
+Username extraction and linking matches all valid Twitter usernames but does
+not verify that the username is a valid Twitter account.
+=== Lists
+Auto-link and extract list names when they are written in @user/list-name
+format.
+=== Hashtags
+Auto-link and extract hashtags, where a hashtag may contain any Latin letters or
+digits but cannot consist solely of digits.
+=== URLs
+Auto-linking and extraction of URLs differ from the Rails default so that they
+work correctly in Tweets written in languages that do not include spaces
+between words.
+=== International
+Special care has been taken to be sure that auto-linking and extraction work
+in Tweets of all languages. This means that languages without spaces between
+words should work equally well.

data/Rakefile ADDED Viewed

@@ -0,0 +1,92 @@
+require 'rubygems' unless ENV['NO_RUBYGEMS']
+require 'rake/gempackagetask'
+require 'rake/rdoctask'
+require 'rubygems/specification'
+require 'date'
+require 'spec/rake/spectask'
+require 'spec/rake/verify_rcov'
+spec = Gem::Specification.new do |s|
+  s.name = "twitter-text"
+  s.version = "1.0"
+  s.author = "Matt Sanford"
+  s.email = "matt@twitter.com"
+  s.homepage = "http://twitter.com"
+  s.description = s.summary = "A gem that provides text handling for Twitter"
+  s.platform = Gem::Platform::RUBY
+  s.has_rdoc = true
+  s.summary = "Twitter text handling library"
+  s.add_dependency "action_view"
+  s.require_path = 'lib'
+  s.autorequire = ''
+  s.files = %w(LICENSE README.rdoc Rakefile TODO) + Dir.glob("{lib,spec}/**/*")
+end
+task :default => :spec
+desc "Run specs"
+Spec::Rake::SpecTask.new do |t|
+  t.spec_files = FileList['spec/**/*_spec.rb']
+  t.spec_opts = %w(-fs --color)
+end
+desc "Run all examples with RCov"
+Spec::Rake::SpecTask.new('spec:rcov') do |t|
+  t.spec_files = FileList['spec/**/*.rb']
+  t.rcov = true
+  t.rcov_opts = ['--exclude', 'spec']
+end
+namespace :test do
+  namespace :conformance do
+    desc "Update conformance testing data"
+    task :update do
+      dir = File.join(File.dirname(__FILE__), "test", "twitter-text-conformance")
+      puts "Updating conformance data ... "
+      system("cd #{dir} && git pull origin master") || exit(1)
+      puts "Updating conformance data ... DONE"
+    end
+    desc "Run conformance test suite"
+    task :run do
+      ruby "test/conformance_test.rb"
+    end
+  end
+  desc "Run conformance test suite"
+  task :conformance => ['conformance:update', 'conformance:run'] do
+  end
+end
+namespace :doc do
+  Rake::RDocTask.new do |rd|
+    rd.main = "README.rdoc"
+    rd.rdoc_dir = 'doc'
+    rd.rdoc_files.include("README.rdoc", "lib/**/*.rb")
+  end
+end
+Rake::GemPackageTask.new(spec) do |pkg|
+  pkg.gem_spec = spec
+end
+desc "install the gem locally"
+task :install => [:package] do
+  sh %{sudo gem install pkg/#{GEM}-#{GEM_VERSION}}
+end
+desc "create a gemspec file"
+task :make_spec do
+  File.open("#{GEM}.gemspec", "w") do |file|
+    file.puts spec.to_ruby
+  end
+end
+desc "runs cruise control build"
+task :cruise => [:spec, 'test:conformance'] do
+end

data/TODO ADDED Viewed

@@ -0,0 +1,3 @@
+TODO:
+ * More tests

data/lib/autolink.rb ADDED Viewed

@@ -0,0 +1,101 @@
+module Twitter
+  # A module for including Tweet auto-linking in a class. The primary use of this is for helpers/views so they can auto-link
+  # usernames, lists, hashtags and URLs.
+  module Autolink
+    include ActionView::Helpers::TagHelper #tag_options needed by auto_link
+    # Default CSS class for auto-linked URLs
+    DEFAULT_URL_CLASS = "tweet-url"
+    # Default CSS class for auto-linked lists (along with the url class)
+    DEFAULT_LIST_CLASS = "list-slug"
+    # Default CSS class for auto-linked usernames (along with the url class)
+    DEFAULT_USERNAME_CLASS = "username"
+    # Default CSS class for auto-linked hashtags (along with the url class)
+    DEFAULT_HASHTAG_CLASS = "hashtag"
+    # Add <tt><a></a></tt> tags around the usernames, lists, hashtags and URLs in the provided <tt>text</tt>. The
+    # <tt><a></tt> tags can be controlled with the following entries in the <tt>options</tt>
+    # hash:
+    #
+    # <tt>:url_class</tt>::     class to add to all <tt><a></tt> tags
+    # <tt>:list_class</tt>::    class to add to list <tt><a></tt> tags
+    # <tt>:username_class</tt>::    class to add to username <tt><a></tt> tags
+    # <tt>:hashtag_class</tt>::    class to add to hashtag <tt><a></tt> tags
+    # <tt>:username_url_base</tt>::      the value for <tt>href</tt> attribute on username links. The <tt>@username</tt> (minus the <tt>@</tt>) will be appended at the end of this.
+    # <tt>:list_url_base</tt>::      the value for <tt>href</tt> attribute on list links. The <tt>@username/list</tt> (minus the <tt>@</tt>) will be appended at the end of this.
+    # <tt>:hashtag_url_base</tt>::      the value for <tt>href</tt> attribute on hashtag links. The <tt>#hashtag</tt> (minus the <tt>#</tt>) will be appended at the end of this.
+    def auto_link(text, options = {}, &block) # :yields: hashtag_or_list_or_username
+      options = options.dup
+      auto_link_usernames_or_lists(auto_link_urls_custom(auto_link_hashtags(text), options, &block), &block)
+    end
+    # Add <tt><a></a></tt> tags around the usernames and lists in the provided <tt>text</tt>. The
+    # <tt><a></tt> tags can be controlled with the following entries in the <tt>options</tt>
+    # hash:
+    #
+    # <tt>:url_class</tt>::     class to add to all <tt><a></tt> tags
+    # <tt>:list_class</tt>::    class to add to list <tt><a></tt> tags
+    # <tt>:username_class</tt>::    class to add to username <tt><a></tt> tags
+    # <tt>:username_url_base</tt>::      the value for <tt>href</tt> attribute on username links. The <tt>@username</tt> (minus the <tt>@</tt>) will be appended at the end of this.
+    # <tt>:list_url_base</tt>::      the value for <tt>href</tt> attribute on list links. The <tt>@username/list</tt> (minus the <tt>@</tt>) will be appended at the end of this.
+    def auto_link_usernames_or_lists(text, options = {}) # :yields: list_or_username
+      options = options.dup
+      options[:url_class] ||= DEFAULT_URL_CLASS
+      options[:list_class] ||= DEFAULT_LIST_CLASS
+      options[:username_class] ||= DEFAULT_USERNAME_CLASS
+      options[:username_url_base] ||= "http://twitter.com/"
+      options[:list_url_base] ||= "http://twitter.com/"
+      text.gsub(Twitter::Regex[:auto_link_usernames_or_lists]) do
+        if $4 && !options[:suppress_lists]
+          # the link is a list
+          text = list = "#{$3}#{$4}"
+          text = yield(list) if block_given?
+          "#{$1}#{$2}<a class=\"#{options[:url_class]} #{options[:list_class]}\" href=\"#{options[:list_url_base]}#{list.downcase}\">#{text}</a>"
+        else
+          # this is a screen name
+          text = $3
+          text = yield(text) if block_given?
+          "#{$1}#{$2}<a class=\"#{options[:url_class]} #{options[:username_class]}\" href=\"#{options[:username_url_base]}#{$3}\">#{text}</a>"
+        end
+      end
+    end
+    # Add <tt><a></a></tt> tags around the hashtags in the provided <tt>text</tt>. The
+    # <tt><a></tt> tags can be controlled with the following entries in the <tt>options</tt>
+    # hash:
+    #
+    # <tt>:url_class</tt>::     class to add to all <tt><a></tt> tags
+    # <tt>:hashtag_class</tt>:: class to add to hashtag <tt><a></tt> tags
+    # <tt>:hashtag_url_base</tt>::      the value for <tt>href</tt> attribute. The hashtag text (minus the <tt>#</tt>) will be appended at the end of this.
+    #
+    def auto_link_hashtags(text, options = {})  # :yields: hashtag_text
+      options = options.dup
+      options[:url_class] ||= DEFAULT_URL_CLASS
+      options[:hashtag_class] ||= DEFAULT_HASHTAG_CLASS
+      options[:hashtag_url_base] ||= "http://twitter.com/search?q=%23"
+      text.gsub(Twitter::Regex[:auto_link_hashtags]) do
+        before = $1
+        hash = $2
+        text = $3
+        text = yield(text) if block_given?
+        "#{before}<a href=\"#{options[:hashtag_url_base]}#{text}\" title=\"##{text}\" class=\"#{options[:url_class]} #{options[:hashtag_class]}\">#{hash}#{text}</a>"
+      end
+    end
+    # Add <tt><a></a></tt> tags around the URLs in the provided <tt>text</tt>. Any
+    # elements in the <tt>href_options</tt> hash will be converted to HTML attributes
+    # and place in the <tt><a></tt> tag.
+    def auto_link_urls_custom(text, href_options = {})
+      text.gsub(Twitter::Regex[:valid_url]) do
+        all, before, url, protocol = $1, $2, $3, $4
+        options = tag_options(href_options.stringify_keys) || ""
+        full_url = (protocol == "www." ? "http://#{url}" : url)
+        "#{before}<a href=\"#{full_url}\"#{options}>#{url}</a>"
+      end
+    end
+  end
+end

data/lib/extractor.rb ADDED Viewed

@@ -0,0 +1,69 @@
+module Twitter
+  # A module for including Tweet parsing in a class. This module provides function for the extraction and processing
+  # of usernames, lists, URLs and hashtags.
+  module Extractor
+    # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
+    # <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
+    # will be returned.
+    #
+    # If a block is given then it will be called for each username.
+    def extract_mentioned_screen_names(text) # :yields: username
+      return [] unless text
+      possible_screen_names = []
+      text.scan(Twitter::Regex[:extract_mentions]) {|before,sn| possible_screen_names << sn }
+      possible_screen_names.each{|sn| yield sn } if block_given?
+      possible_screen_names
+    end
+    # Extracts the username username replied to in the Tweet <tt>text</tt>. If the
+    # <tt>text</tt> is <tt>nil</tt> or is not a reply nil will be returned.
+    #
+    # If a block is given then it will be called with the username replied to (if any)
+    def extract_reply_screen_name(text) # :yields: username
+      return nil unless text
+      possible_screen_name = text.match(Twitter::Regex[:extract_reply])
+      return unless possible_screen_name.respond_to?(:captures)
+      screen_name = possible_screen_name.captures.first
+      yield screen_name if block_given?
+      screen_name
+    end
+    # Extracts a list of all URLs included in the Tweet <tt>text</tt>. If the
+    # <tt>text</tt> is <tt>nil</tt> or contains no URLs an empty array
+    # will be returned.
+    #
+    # If a block is given then it will be called for each URL.
+    def extract_urls(text) # :yields: url
+      return [] unless text
+      urls = []
+      text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, path, query|
+        urls << (protocol == "www." ? "http://#{url}" : url)
+      end
+      urls.each{|url| yield url } if block_given?
+      urls
+    end
+    # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
+    # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
+    # will be returned. The array returned will not include the leading <tt>#</tt>
+    # character.
+    #
+    # If a block is given then it will be called for each hashtag.
+    def extract_hashtags(text) # :yields: hashtag_text
+      return [] unless text
+      tags = []
+      text.scan(Twitter::Regex[:auto_link_hashtags]) do |before, hash, hash_text|
+        tags << hash_text
+      end
+      tags.each{|tag| yield tag } if block_given?
+      tags
+    end
+  end
+end

data/lib/regex.rb ADDED Viewed

@@ -0,0 +1,74 @@
+module Twitter
+  # A collection of regular expressions for parsing Tweet text. Each regular expression
+  # is frozen at load time (the REGEXEN hash that holds them is not). These regular expressions are
+  # used throughout the <tt>Twitter</tt> classes. Special care has been taken to make
+  # sure these regular expressions work with Tweets in all languages.
+  class Regex
+    REGEXEN = {} # :nodoc:
+    # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
+    # to access both the list of characters and a pattern suitable for use with String#split
+    #  Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
+    UNICODE_SPACES = [
+          (0x0009..0x000D).to_a,  # White_Space # Cc   [5] <control-0009>..<control-000D>
+          0x0020,          # White_Space # Zs       SPACE
+          0x0085,          # White_Space # Cc       <control-0085>
+          0x00A0,          # White_Space # Zs       NO-BREAK SPACE
+          0x1680,          # White_Space # Zs       OGHAM SPACE MARK
+          0x180E,          # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
+          (0x2000..0x200A).to_a, # White_Space # Zs  [11] EN QUAD..HAIR SPACE
+          0x2028,          # White_Space # Zl       LINE SEPARATOR
+          0x2029,          # White_Space # Zp       PARAGRAPH SEPARATOR
+          0x202F,          # White_Space # Zs       NARROW NO-BREAK SPACE
+          0x205F,          # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
+          0x3000,          # White_Space # Zs       IDEOGRAPHIC SPACE
+        ].flatten.freeze
+    REGEXEN[:spaces] = Regexp.new(UNICODE_SPACES.collect{ |e| [e].pack 'U*' }.join('|'))
+    REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])[@＠]([a-zA-Z0-9_]{1,20})/
+    REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*[@＠]([a-zA-Z0-9_]{1,20})/o
+    REGEXEN[:list_name] = /^[a-zA-Z\x80-\xff].{0,79}$/
+    # Latin accented characters (0xD7 and 0xF7 are left out of the ranges; 0xD7 is a confusable multiplication sign that looks like "x")
+    LATIN_ACCENTS = [(0xc0..0xd6).to_a, (0xd8..0xf6).to_a, (0xf8..0xff).to_a].flatten.pack('U*').freeze
+    REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
+    # Characters considered valid in a hashtag but not at the beginning, where only a-z and 0-9 are valid.
+    HASHTAG_CHARACTERS = /[a-z0-9_#{LATIN_ACCENTS}]/io
+    REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/]+)(#|＃)([0-9A-Z_]*[A-Z_]+#{HASHTAG_CHARACTERS}*)/io
+    REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^)([@＠]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9\x80-\xff\-]{0,79})?/
+    REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
+    # URL related hash regex collection
+    REGEXEN[:valid_preceeding_chars] = /(?:[^\/"':!=]|^)/
+    REGEXEN[:valid_domain] = /[a-z0-9\.-]+\.[a-z]{2,}(?::[0-9]+)?/i
+    REGEXEN[:valid_url_path_chars] = /[a-z0-9!\*'\(\);:@&=\+\$\/%#\[\]\-_\.,~]/i
+    # Valid end-of-path characters (so /foo. does not gobble the period).
+    #   1. Allow ) for Wikipedia URLs.
+    #   2. Allow =&# for empty URL parameters and other URL-join artifacts (NOTE(review): the character class below has no & -- confirm which is intended)
+    REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9\)=#\/]/i
+    REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:@&=\+\$\/%#\[\]\-_\.,~]/i
+    REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#]/i
+    REGEXEN[:valid_url] = %r{
+      (                                                                                     #   $1 total match
+        (#{REGEXEN[:valid_preceeding_chars]})                                               #   $2 Preceeding chracter
+        (                                                                                   #   $3 URL
+          (https?:\/\/|www\.)                                                               #   $4 Protocol or beginning
+          (#{REGEXEN[:valid_domain]})                                                       #   $5 Domain(s) and optional post number
+          (/#{REGEXEN[:valid_url_path_chars]}*#{REGEXEN[:valid_url_path_ending_chars]}?)?   #   $6 URL Path
+          (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? #   $7 Query String
+        )
+      )
+    }iox;
+    REGEXEN.each_pair{|k,v| v.freeze }
+    # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
+    # is not a known symbol, <tt>nil</tt> will be returned (plain Hash lookup on REGEXEN).
+    def self.[](key)
+      REGEXEN[key]
+    end
+  end
+end

data/lib/twitter-text.rb ADDED Viewed

@@ -0,0 +1,13 @@
+raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless ['u','UTF8'].include?($KCODE) # fail at load time unless $KCODE is exactly one of these two values
+require 'rubygems'
+# Needed for auto-linking: Twitter::Autolink mixes in ActionView::Helpers::TagHelper
+require 'action_view'
+require File.join(File.dirname(__FILE__), 'regex')
+require File.join(File.dirname(__FILE__), 'autolink')
+require File.join(File.dirname(__FILE__), 'extractor')
+require File.join(File.dirname(__FILE__), 'unicode')
+require File.join(File.dirname(__FILE__), 'validation')

data/lib/unicode.rb ADDED Viewed

@@ -0,0 +1,27 @@
+module Twitter
+  # This module lazily defines constants of the form Uxxxx for all Unicode
+  # codepoints from U0000 to U10FFFF. The value of each constant is the
+  # UTF-8 string for the codepoint. An underscore after the U (U_00A9) and lower-case hex digits are accepted too.
+  # Examples:
+  #   copyright = Unicode::U00A9
+  #   euro = Unicode::U20AC
+  #   infinity = Unicode::U221E
+  #
+  module Unicode
+    CODEPOINT_REGEX = /^U_?([0-9a-fA-F]{4,5}|10[0-9a-fA-F]{4})$/
+    def self.const_missing(name)
+      # Check that the constant name is of the right form: U0000 to U10FFFF
+      if name.to_s =~ CODEPOINT_REGEX
+        # Convert the codepoint to an immutable UTF-8 string,
+        # define a real constant for that value and return the value
+        #p name, name.class
+        const_set(name, [$1.to_i(16)].pack("U").freeze)
+      else  # Raise an error for constants that are not Unicode.
+        raise NameError, "Uninitialized constant: Unicode::#{name}"
+      end
+    end
+  end
+end