RubyGems - redaranj-twitter-text - Versions diffs - 1.0.4.191 - Mend

redaranj-twitter-text 1.0.4.191

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

data/LICENSE ADDED

@@ -0,0 +1,13 @@
+Copyright 2010 Twitter, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not
+use this file except in compliance with the License. You may obtain a copy of
+the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+License for the specific language governing permissions and limitations under
+the License.

data/README.rdoc ADDED

@@ -0,0 +1,66 @@
+== twitter-text
+A gem that provides text processing routines for Twitter Tweets. The major
+reason for this is to unify the various auto-linking and extraction of
+usernames, lists, hashtags and URLs.
+== Extraction Examples
+  # Extraction
+  class MyClass
+    include Twitter::Extractor
+    usernames = extract_mentioned_screen_names("Mentioning @twitter and @jack")
+    # usernames = ["twitter", "jack"]
+  end
+  # Extraction with a block argument
+  class MyClass
+    include Twitter::Extractor
+    extract_reply_screen_name("@twitter are you hiring?") do |username|
+      # username = "twitter"
+    end
+  end
+== Auto-linking Examples
+  # Auto-link
+  class MyClass
+    include Twitter::Autolink
+    html = auto_link("link @user, please #request")
+  end
+=== Usernames
+Username extraction and linking matches all valid Twitter usernames but does
+not verify that the username is a valid Twitter account.
+=== Lists
+Auto-link and extract list names when they are written in @user/list-name
+format.
+=== Hashtags
+Auto-link and extract hashtags, where a hashtag contains any latin letter or
+number but cannot be solely numbers.
+=== URLs
+Auto-linking and extraction of URLs differs from the Rails default so that it
+will work correctly in Tweets written in languages that do not include spaces
+between words.
+=== International
+Special care has been taken to be sure that auto-linking and extraction work
+in Tweets of all languages. This means that languages without spaces between
+words should work equally well.
+=== Conformance
+To run the Conformance suite, you'll need to add that project as a git submodule.  From the root twitter-text-rb directory, run:
+  git submodule add git@github.com:mzsanford/twitter-text-conformance.git test/twitter-text-conformance/
+  git submodule init
+  git submodule update

data/Rakefile ADDED

@@ -0,0 +1,115 @@
+require 'rubygems' unless ENV['NO_RUBYGEMS']
+require 'rake/gempackagetask'
+require 'rake/rdoctask'
+require 'rubygems/specification'
+require 'date'
+require 'spec/rake/spectask'
+require 'spec/rake/verify_rcov'
+require 'digest'
+spec = Gem::Specification.new do |s|
+  s.name = "twitter-text"
+  s.version = "1.0.4"
+  s.authors = ["Matt Sanford", "Patrick Ewing"]
+  s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com"]
+  s.homepage = "http://twitter.com"
+  s.description = s.summary = "A gem that provides text handling for Twitter"
+  s.platform = Gem::Platform::RUBY
+  s.has_rdoc = true
+  s.summary = "Twitter text handling library"
+  s.add_dependency "actionpack"
+  s.require_path = 'lib'
+  s.autorequire = ''
+  s.files = %w(LICENSE README.rdoc Rakefile TODO) + Dir.glob("{lib,spec}/**/*")
+end
+task :default => :spec
+desc "Run specs"
+Spec::Rake::SpecTask.new do |t|
+  t.spec_files = FileList['spec/**/*_spec.rb']
+  t.spec_opts = %w(-fs --color)
+end
+desc "Run all examples with RCov"
+Spec::Rake::SpecTask.new('spec:rcov') do |t|
+  t.spec_files = FileList['spec/**/*.rb']
+  t.rcov = true
+  t.rcov_opts = ['--exclude', 'spec']
+end
+def conformance_version(dir)
+  Dir[File.join(dir, '*')].inject(Digest::SHA1.new){|digest, file| digest.update(Digest::SHA1.file(file).hexdigest) }
+end
+namespace :test do
+  namespace :conformance do
+    desc "Update conformance testing data"
+    task :update do
+      puts "Updating conformance data ... "
+      system("git submodule init") || raise("Failed to init submodule")
+      system("git submodule update") || raise("Failed to update submodule")
+      puts "Updating conformance data ... DONE"
+    end
+    desc "Change conformance test data to the lastest version"
+    task :latest => ['conformance:update'] do
+      current_dir = File.dirname(__FILE__)
+      submodule_dir = File.join(File.dirname(__FILE__), "test", "twitter-text-conformance")
+      version_before = conformance_version(submodule_dir)
+      system("cd #{submodule_dir} && git pull origin master") || raise("Failed to pull submodule version")
+      system("cd #{current_dir}")
+      if conformance_version(submodule_dir) != version_before
+        system("cd #{current_dir} && git add #{submodule_dir}") || raise("Failed to add upgrade files")
+        system("git commit -m \"Upgraded to the latest conformance suite\" #{submodule_dir}") || raise("Failed to commit upgraded conformacne data")
+        puts "Upgraded conformance suite."
+      else
+        puts "No conformance suite changes."
+      end
+    end
+    desc "Run conformance test suite"
+    task :run do
+      ruby "test/conformance_test.rb"
+    end
+  end
+  desc "Run conformance test suite"
+  task :conformance => ['conformance:latest', 'conformance:run'] do
+  end
+end
+namespace :doc do
+  Rake::RDocTask.new do |rd|
+    rd.main = "README.rdoc"
+    rd.rdoc_dir = 'doc'
+    rd.rdoc_files.include("README.rdoc", "lib/**/*.rb")
+  end
+end
+Rake::GemPackageTask.new(spec) do |pkg|
+  pkg.gem_spec = spec
+end
+desc "install the gem locally"
+task :install => [:package] do
+  sh %{sudo gem install pkg/#{GEM}-#{GEM_VERSION}}
+end
+desc "create a gemspec file"
+task :make_spec do
+  File.open("#{GEM}.gemspec", "w") do |file|
+    file.puts spec.to_ruby
+  end
+end
+desc "runs cruise control build"
+task :cruise => [:spec, 'test:conformance'] do
+end

data/TODO ADDED

@@ -0,0 +1,3 @@
+TODO:
+ * More tests

data/lib/autolink.rb ADDED

@@ -0,0 +1,118 @@
+# encoding: utf-8
+module Twitter
+  # A module for including Tweet auto-linking in a class. The primary use of this is for helpers/views so they can auto-link
+  # usernames, lists, hashtags and URLs.
+  module Autolink
+    include ActionView::Helpers::TagHelper #tag_options needed by auto_link
+    WWW_REGEX = /www\./i #:nodoc:
+    # Default CSS class for auto-linked URLs
+    DEFAULT_URL_CLASS = "tweet-url"
+    # Default CSS class for auto-linked lists (along with the url class)
+    DEFAULT_LIST_CLASS = "list-slug"
+    # Default CSS class for auto-linked usernames (along with the url class)
+    DEFAULT_USERNAME_CLASS = "username"
+    # Default CSS class for auto-linked hashtags (along with the url class)
+    DEFAULT_HASHTAG_CLASS = "hashtag"
+    # HTML attribute for robot nofollow behavior (default)
+    HTML_ATTR_NO_FOLLOW = " rel=\"nofollow\""
+    # Add <tt><a></a></tt> tags around the usernames, lists, hashtags and URLs in the provided <tt>text</tt>. The
+    # <tt><a></tt> tags can be controlled with the following entries in the <tt>options</tt>
+    # hash:
+    #
+    # <tt>:url_class</tt>::     class to add to all <tt><a></tt> tags
+    # <tt>:list_class</tt>::    class to add to list <tt><a></tt> tags
+    # <tt>:username_class</tt>::    class to add to username <tt><a></tt> tags
+    # <tt>:hashtag_class</tt>::    class to add to hashtag <tt><a></tt> tags
+    # <tt>:username_url_base</tt>::      the value for <tt>href</tt> attribute on username links. The <tt>@username</tt> (minus the <tt>@</tt>) will be appended at the end of this.
+    # <tt>:list_url_base</tt>::      the value for <tt>href</tt> attribute on list links. The <tt>@username/list</tt> (minus the <tt>@</tt>) will be appended at the end of this.
+    # <tt>:hashtag_url_base</tt>::      the value for <tt>href</tt> attribute on hashtag links. The <tt>#hashtag</tt> (minus the <tt>#</tt>) will be appended at the end of this.
+    # <tt>:suppress_lists</tt>::    disable auto-linking to lists
+    # <tt>:suppress_no_follow</tt>::   Do not add <tt>rel="nofollow"</tt> to auto-linked items
+    def auto_link(text, options = {})
+      auto_link_usernames_or_lists(
+        auto_link_urls_custom(
+          auto_link_hashtags(text, options),
+        options),
+      options)
+    end
+    # Add <tt><a></a></tt> tags around the usernames and lists in the provided <tt>text</tt>. The
+    # <tt><a></tt> tags can be controlled with the following entries in the <tt>options</tt>
+    # hash:
+    #
+    # <tt>:url_class</tt>::     class to add to all <tt><a></tt> tags
+    # <tt>:list_class</tt>::    class to add to list <tt><a></tt> tags
+    # <tt>:username_class</tt>::    class to add to username <tt><a></tt> tags
+    # <tt>:username_url_base</tt>::      the value for <tt>href</tt> attribute on username links. The <tt>@username</tt> (minus the <tt>@</tt>) will be appended at the end of this.
+    # <tt>:list_url_base</tt>::      the value for <tt>href</tt> attribute on list links. The <tt>@username/list</tt> (minus the <tt>@</tt>) will be appended at the end of this.
+    # <tt>:suppress_lists</tt>::    disable auto-linking to lists
+    # <tt>:suppress_no_follow</tt>::   Do not add <tt>rel="nofollow"</tt> to auto-linked items
+    def auto_link_usernames_or_lists(text, options = {}) # :yields: list_or_username
+      options = options.dup
+      options[:url_class] ||= DEFAULT_URL_CLASS
+      options[:list_class] ||= DEFAULT_LIST_CLASS
+      options[:username_class] ||= DEFAULT_USERNAME_CLASS
+      options[:username_url_base] ||= "http://twitter.com/"
+      options[:list_url_base] ||= "http://twitter.com/"
+      extra_html = HTML_ATTR_NO_FOLLOW unless options[:suppress_no_follow]
+      text.gsub(Twitter::Regex[:auto_link_usernames_or_lists]) do
+        if $4 && !options[:suppress_lists]
+          # the link is a list
+          text = list = "#{$3}#{$4}"
+          text = yield(list) if block_given?
+          "#{$1}#{$2}<a class=\"#{options[:url_class]} #{options[:list_class]}\" href=\"#{options[:list_url_base]}#{list.downcase}\"#{extra_html}>#{text}</a>"
+        else
+          # this is a screen name
+          text = $3
+          text = yield(text) if block_given?
+          "#{$1}#{$2}<a class=\"#{options[:url_class]} #{options[:username_class]}\" href=\"#{options[:username_url_base]}#{text}\"#{extra_html}>#{text}</a>"
+        end
+      end
+    end
+    # Add <tt><a></a></tt> tags around the hashtags in the provided <tt>text</tt>. The
+    # <tt><a></tt> tags can be controlled with the following entries in the <tt>options</tt>
+    # hash:
+    #
+    # <tt>:url_class</tt>::     class to add to all <tt><a></tt> tags
+    # <tt>:hashtag_class</tt>:: class to add to hashtag <tt><a></tt> tags
+    # <tt>:hashtag_url_base</tt>::      the value for <tt>href</tt> attribute. The hashtag text (minus the <tt>#</tt>) will be appended at the end of this.
+    # <tt>:suppress_no_follow</tt>::   Do not add <tt>rel="nofollow"</tt> to auto-linked items
+    def auto_link_hashtags(text, options = {})  # :yields: hashtag_text
+      options = options.dup
+      options[:url_class] ||= DEFAULT_URL_CLASS
+      options[:hashtag_class] ||= DEFAULT_HASHTAG_CLASS
+      options[:hashtag_url_base] ||= "http://twitter.com/search?q=%23"
+      extra_html = HTML_ATTR_NO_FOLLOW unless options[:suppress_no_follow]
+      text.gsub(Twitter::Regex[:auto_link_hashtags]) do
+        before = $1
+        hash = $2
+        text = $3
+        text = yield(text) if block_given?
+        "#{before}<a href=\"#{options[:hashtag_url_base]}#{text}\" title=\"##{text}\" class=\"#{options[:url_class]} #{options[:hashtag_class]}\"#{extra_html}>#{hash}#{text}</a>"
+      end
+    end
+    # Add <tt><a></a></tt> tags around the URLs in the provided <tt>text</tt>. Any
+    # elements in the <tt>href_options</tt> hash will be converted to HTML attributes
+    # and place in the <tt><a></tt> tag. Unless <tt>href_options</tt> contains <tt>:suppress_no_follow</tt>
+    # the <tt>rel="nofollow"</tt> attribute will be added.
+    def auto_link_urls_custom(text, href_options = {})
+      options = href_options.dup
+      options[:rel] = "nofollow" unless options.delete(:suppress_no_follow)
+      text.gsub(Twitter::Regex[:valid_url]) do
+        all, before, url, protocol = $1, $2, $3, $4
+        html_attrs = tag_options(options.stringify_keys) || ""
+        full_url = (protocol =~ WWW_REGEX ? "http://#{url}" : url)
+        "#{before}<a href=\"#{full_url}\"#{html_attrs}>#{url}</a>"
+      end
+    end
+  end
+end

data/lib/extractor.rb ADDED

@@ -0,0 +1,70 @@
+# encoding: utf-8
+module Twitter
+  # A module for including Tweet parsing in a class. This module provides function for the extraction and processing
+  # of usernames, lists, URLs and hashtags.
+  module Extractor
+    # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>. If the
+    # <tt>text</tt> is <tt>nil</tt> or contains no username mentions an empty array
+    # will be returned.
+    #
+    # If a block is given then it will be called for each username.
+    def extract_mentioned_screen_names(text) # :yields: username
+      return [] unless text
+      possible_screen_names = []
+      text.scan(Twitter::Regex[:extract_mentions]) do |before, sn, after|
+        possible_screen_names << sn unless after =~ Twitter::Regex[:at_signs]
+      end
+      possible_screen_names.each{|sn| yield sn } if block_given?
+      possible_screen_names
+    end
+    # Extracts the username username replied to in the Tweet <tt>text</tt>. If the
+    # <tt>text</tt> is <tt>nil</tt> or is not a reply nil will be returned.
+    #
+    # If a block is given then it will be called with the username replied to (if any)
+    def extract_reply_screen_name(text) # :yields: username
+      return nil unless text
+      possible_screen_name = text.match(Twitter::Regex[:extract_reply])
+      return unless possible_screen_name.respond_to?(:captures)
+      screen_name = possible_screen_name.captures.first
+      yield screen_name if block_given?
+      screen_name
+    end
+    # Extracts a list of all URLs included in the Tweet <tt>text</tt>. If the
+    # <tt>text</tt> is <tt>nil</tt> or contains no URLs an empty array
+    # will be returned.
+    #
+    # If a block is given then it will be called for each URL.
+    def extract_urls(text) # :yields: url
+      return [] unless text
+      urls = []
+      text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, path, query|
+        urls << (protocol == "www." ? "http://#{url}" : url)
+      end
+      urls.each{|url| yield url } if block_given?
+      urls
+    end
+    # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
+    # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
+    # will be returned. The array returned will not include the leading <tt>#</tt>
+    # character.
+    #
+    # If a block is given then it will be called for each hashtag.
+    def extract_hashtags(text) # :yields: hashtag_text
+      return [] unless text
+      tags = []
+      text.scan(Twitter::Regex[:auto_link_hashtags]) do |before, hash, hash_text|
+        tags << hash_text
+      end
+      tags.each{|tag| yield tag } if block_given?
+      tags
+    end
+  end
+end

data/lib/regex.rb ADDED

@@ -0,0 +1,75 @@
+# encoding: utf-8
+module Twitter
+  # A collection of regular expressions for parsing Tweet text. Each regular expression
+  # is frozen at load time (the REGEXEN hash that holds them is not itself frozen). These
+  # regular expressions are used throughout the <tt>Twitter</tt> classes. Special care has
+  # been taken to make sure these regular expressions work with Tweets in all languages.
+  #
+  # Look expressions up by symbol with <tt>Twitter::Regex[:name]</tt>.
+  class Regex
+    REGEXEN = {} # :nodoc:
+    # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
+    # to access both the list of characters and a pattern suitable for use with String#split
+    #  Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
+    UNICODE_SPACES = [
+          (0x0009..0x000D).to_a,  # White_Space # Cc   [5] <control-0009>..<control-000D>
+          0x0020,          # White_Space # Zs       SPACE
+          0x0085,          # White_Space # Cc       <control-0085>
+          0x00A0,          # White_Space # Zs       NO-BREAK SPACE
+          0x1680,          # White_Space # Zs       OGHAM SPACE MARK
+          0x180E,          # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
+          (0x2000..0x200A).to_a, # White_Space # Zs  [11] EN QUAD..HAIR SPACE
+          0x2028,          # White_Space # Zl       LINE SEPARATOR
+          0x2029,          # White_Space # Zp       PARAGRAPH SEPARATOR
+          0x202F,          # White_Space # Zs       NARROW NO-BREAK SPACE
+          0x205F,          # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
+          0x3000,          # White_Space # Zs       IDEOGRAPHIC SPACE
+        ].flatten.freeze
+    # Alternation of every code point in UNICODE_SPACES, each packed as a UTF-8 character.
+    REGEXEN[:spaces] = Regexp.new(UNICODE_SPACES.collect{ |e| [e].pack 'U*' }.join('|'))
+    # The ASCII at sign and its full-width form.
+    REGEXEN[:at_signs] = /[@＠]/
+    # Groups: 1 = start of line or a character that cannot be part of a screen name,
+    # 2 = the screen name (1 to 20 characters), 3 = the character following the screen name
+    # (captured inside a lookahead, so it is not consumed by the match).
+    REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(?=(.|$))/o
+    # Optional leading whitespace, an at sign, then the screen name as group 1.
+    REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
+    # A list name: a letter followed by up to 79 further characters.
+    REGEXEN[:list_name] = /^[a-zA-Z].{0,79}$/
+    # Latin accented characters (subtracted 0xD7 from the range, it's a confusable multiplication sign. Looks like "x")
+    LATIN_ACCENTS = [(0xc0..0xd6).to_a, (0xd8..0xf6).to_a, (0xf8..0xff).to_a].flatten.pack('U*').freeze
+    REGEXEN[:latin_accents] = /[#{LATIN_ACCENTS}]+/o
+    # Characters allowed in the tail of a hashtag. The leading part of a hashtag (see
+    # :auto_link_hashtags below) is limited to 0-9, a-z and underscore and must contain at
+    # least one letter or underscore, so a hashtag can never consist solely of digits.
+    HASHTAG_CHARACTERS = /[a-z0-9_#{LATIN_ACCENTS}]/io
+    # Groups: 1 = text preceding the hash sign, 2 = the hash sign (ASCII or full-width),
+    # 3 = the hashtag text.
+    REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/]+)(#|＃)([0-9A-Z_]*[A-Z_]+#{HASHTAG_CHARACTERS}*)/io
+    # Groups: 1 = preceding character (or start of line), 2 = one or more at signs,
+    # 3 = the screen name, 4 = optional "/list-slug" (nil when absent).
+    REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^)([@＠]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9\-]{0,79})?/
+    REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
+    # URL related hash regex collection
+    REGEXEN[:valid_preceding_chars] = /(?:[^\/"':!=]|^|\:)/
+    REGEXEN[:valid_domain] = /(?:[^[:punct:]\s][\.-][^[:punct:]\s]|[^[:punct:]\s])+\.[a-z]{2,}(?::[0-9]+)?/i
+    REGEXEN[:valid_url_path_chars] = /[\.\,]?[a-z0-9!\*'\(\);:=\+\$\/%#\[\]\-_,~@]/i
+    # Valid end-of-path characters (so /foo. does not gobble the period).
+    #   1. Allow ) for Wikipedia URLs.
+    #   2. Allow =&# for empty URL parameters and other URL-join artifacts
+    REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9\)=#\/]/i
+    REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~]/i
+    REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#]/i
+    # Full URL matcher assembled from the pieces above. The group numbers documented
+    # inside the pattern are what callers read via $1..$7 or as scan block arguments.
+    REGEXEN[:valid_url] = %r{
+      (                                                                                     #   $1 total match
+        (#{REGEXEN[:valid_preceding_chars]})                                                #   $2 Preceeding chracter
+        (                                                                                   #   $3 URL
+          (https?:\/\/|www\.)                                                               #   $4 Protocol or beginning
+          (#{REGEXEN[:valid_domain]})                                                       #   $5 Domain(s) and optional post number
+          (/#{REGEXEN[:valid_url_path_chars]}*#{REGEXEN[:valid_url_path_ending_chars]}?)?   #   $6 URL Path
+          (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? #   $7 Query String
+        )
+      )
+    }iox;
+    # Freeze every stored expression; the hash itself is left unfrozen.
+    REGEXEN.each_pair{|k,v| v.freeze }
+    # Return the regular expression for a given <tt>key</tt>. If the <tt>key</tt>
+    # is not a known symbol a <tt>nil</tt> will be returned.
+    def self.[](key)
+      REGEXEN[key]
+    end
+  end
+end