twitter-text 1.0.1 → 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -56,3 +56,11 @@ between words.
56
56
  Special care has been taken to be sure that auto-linking and extraction work
57
57
  in Tweets of all languages. This means that languages without spaces between
58
58
  words should work equally well.
59
+
60
+ === Conformance
61
+
62
+ To run the conformance suite, you'll need to add the twitter-text-conformance project as a git submodule. From the root twitter-text-rb directory, run:
63
+
64
+ git submodule add git@github.com:mzsanford/twitter-text-conformance.git test/twitter-text-conformance/
65
+ git submodule init
66
+ git submodule update
data/Rakefile CHANGED
@@ -7,10 +7,9 @@ require 'spec/rake/spectask'
7
7
  require 'spec/rake/verify_rcov'
8
8
  require 'digest'
9
9
 
10
-
11
10
  spec = Gem::Specification.new do |s|
12
11
  s.name = "twitter-text"
13
- s.version = "1.0.1"
12
+ s.version = "1.0.2"
14
13
  s.author = "Matt Sanford"
15
14
  s.email = "matt@twitter.com"
16
15
  s.homepage = "http://twitter.com"
data/lib/extractor.rb CHANGED
@@ -1,4 +1,3 @@
1
-
2
1
  module Twitter
3
2
  # A module for including Tweet parsing in a class. This module provides functions for the extraction and processing
4
3
  # of usernames, lists, URLs and hashtags.
@@ -13,7 +12,9 @@ module Twitter
13
12
  return [] unless text
14
13
 
15
14
  possible_screen_names = []
16
- text.scan(Twitter::Regex[:extract_mentions]) {|before,sn| possible_screen_names << sn }
15
+ text.scan(Twitter::Regex[:extract_mentions]) do |before, sn, after|
16
+ possible_screen_names << sn unless after =~ Twitter::Regex[:at_signs]
17
+ end
17
18
  possible_screen_names.each{|sn| yield sn } if block_given?
18
19
  possible_screen_names
19
20
  end
@@ -39,7 +40,6 @@ module Twitter
39
40
  # If a block is given then it will be called for each URL.
40
41
  def extract_urls(text) # :yields: url
41
42
  return [] unless text
42
-
43
43
  urls = []
44
44
  text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, path, query|
45
45
  urls << (protocol == "www." ? "http://#{url}" : url)
@@ -66,4 +66,4 @@ module Twitter
66
66
  end
67
67
 
68
68
  end
69
- end
69
+ end
data/lib/regex.rb CHANGED
@@ -26,8 +26,9 @@ module Twitter
26
26
  ].flatten.freeze
27
27
  REGEXEN[:spaces] = Regexp.new(UNICODE_SPACES.collect{ |e| [e].pack 'U*' }.join('|'))
28
28
 
29
- REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])[@@]([a-zA-Z0-9_]{1,20})(?!@)/
30
- REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*[@@]([a-zA-Z0-9_]{1,20})/o
29
+ REGEXEN[:at_signs] = /[@@]/
30
+ REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(?=(.|$))/o
31
+ REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
31
32
 
32
33
  REGEXEN[:list_name] = /^[a-zA-Z\x80-\xff].{0,79}$/
33
34
 
@@ -42,9 +43,9 @@ module Twitter
42
43
  REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
43
44
 
44
45
  # URL related hash regex collection
45
- REGEXEN[:valid_preceeding_chars] = /(?:[^\/"':!=]|^|\:)/
46
- REGEXEN[:valid_domain] = /(?:[\.-]|[^[:punct:]])+\.[a-z]{2,}(?::[0-9]+)?/i
47
- REGEXEN[:valid_url_path_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~]/i
46
+ REGEXEN[:valid_preceding_chars] = /(?:[^\/"':!=]|^|\:)/
47
+ REGEXEN[:valid_domain] = /(?:[\.-]|[^[:punct:]\s])+\.[a-z]{2,}(?::[0-9]+)?/i
48
+ REGEXEN[:valid_url_path_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~@]/i
48
49
  # Valid end-of-path characters (so /foo. does not gobble the period).
49
50
  # 1. Allow ) for Wikipedia URLs.
50
51
  # 2. Allow =&# for empty URL parameters and other URL-join artifacts
@@ -53,7 +54,7 @@ module Twitter
53
54
  REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#]/i
54
55
  REGEXEN[:valid_url] = %r{
55
56
  ( # $1 total match
56
- (#{REGEXEN[:valid_preceeding_chars]}) # $2 Preceeding chracter
57
+ (#{REGEXEN[:valid_preceding_chars]}) # $2 Preceding character
57
58
  ( # $3 URL
58
59
  (https?:\/\/|www\.) # $4 Protocol or beginning
59
60
  (#{REGEXEN[:valid_domain]}) # $5 Domain(s) and optional post number
data/lib/twitter-text.rb CHANGED
@@ -1,4 +1,3 @@
1
-
2
1
  raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless ['u','UTF8'].include?($KCODE)
3
2
 
4
3
  require 'rubygems'
@@ -10,4 +9,4 @@ require File.join(File.dirname(__FILE__), 'regex')
10
9
  require File.join(File.dirname(__FILE__), 'autolink')
11
10
  require File.join(File.dirname(__FILE__), 'extractor')
12
11
  require File.join(File.dirname(__FILE__), 'unicode')
13
- require File.join(File.dirname(__FILE__), 'validation')
12
+ require File.join(File.dirname(__FILE__), 'validation')
data/lib/unicode.rb CHANGED
@@ -1,4 +1,3 @@
1
-
2
1
  module Twitter
3
2
  # This module lazily defines constants of the form Uxxxx for all Unicode
4
3
  # codepoints from U0000 to U10FFFF. The value of each constant is the
@@ -24,4 +23,4 @@ module Twitter
24
23
  end
25
24
  end
26
25
 
27
- end
26
+ end
data/lib/validation.rb CHANGED
@@ -1,4 +1,3 @@
1
-
2
1
  module Twitter
3
2
  module Validation
4
3
  MAX_LENGTH = 140
@@ -48,4 +47,4 @@ module Twitter
48
47
  return false
49
48
  end
50
49
  end
51
- end
50
+ end
@@ -59,21 +59,21 @@ describe Twitter::Extractor do
59
59
  @extractor.extract_reply_screen_name("@alice reply text").should == "alice"
60
60
  end
61
61
 
62
- it "should extract preceeded by a space" do
62
+ it "should extract preceded by a space" do
63
63
  @extractor.extract_reply_screen_name(" @alice reply text").should == "alice"
64
64
  end
65
65
 
66
- it "should extract preceeded by a full-width space" do
66
+ it "should extract preceded by a full-width space" do
67
67
  @extractor.extract_reply_screen_name("#{[0x3000].pack('U')}@alice reply text").should == "alice"
68
68
  end
69
69
  end
70
70
 
71
71
  context "should not be extracted from" do
72
- it "should not be extracted when preceeded by text" do
72
+ it "should not be extracted when preceded by text" do
73
73
  @extractor.extract_reply_screen_name("reply @alice text").should == nil
74
74
  end
75
75
 
76
- it "should not be extracted when preceeded by puctuation" do
76
+ it "should not be extracted when preceded by puctuation" do
77
77
  %w(. / _ - + # ! @).each do |punct|
78
78
  @extractor.extract_reply_screen_name("#{punct}@alice text").should == nil
79
79
  end
@@ -99,39 +99,21 @@ describe Twitter::Extractor do
99
99
 
100
100
  describe "urls" do
101
101
  describe "matching URLS" do
102
- @urls = [
103
- "http://google.com",
104
- "http://foobar.com/#",
105
- "http://google.com/#foo",
106
- "http://google.com/#search?q=iphone%20-filter%3Alinks",
107
- "http://twitter.com/#search?q=iphone%20-filter%3Alinks",
108
- "http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html",
109
- "http://somehost.com:3000",
110
- "http://x.com/~matthew+%-x",
111
- "http://en.wikipedia.org/wiki/Primer_(film)",
112
- "http://www.ams.org/bookstore-getitem/item=mbk-59",
113
- "http://chilp.it/?77e8fd",
114
- ]
115
-
116
- @urls.each do |url|
117
- it "should extract the URL #{url}" do
118
- @extractor.extract_urls(url).should == [url]
102
+ TestUrls::VALID.each do |url|
103
+ it "should extract the URL #{url} and prefix it with a protocol if missing" do
104
+ @extractor.extract_urls(url).first.should include(url)
119
105
  end
120
106
 
121
107
  it "should match the URL #{url} when it's embedded in other text" do
122
108
  text = "Sweet url: #{url} I found. #awesome"
123
- @extractor.extract_urls(text).should == [url]
109
+ @extractor.extract_urls(text).first.should include(url)
124
110
  end
125
111
  end
126
112
  end
127
113
 
128
114
  describe "invalid URLS" do
129
- it "does not link urls with invalid_domains" do
130
- [ "http://doman-dash_2314352345_dfasd.foo-cow_4352.com",
131
- "http://no-tld",
132
- "http://tld-too-short.x",
133
- "http://doman-dash_2314352345_dfasd.foo-cow_4352.com",
134
- ].each {|url| @extractor.extract_urls(url).should == [] }
115
+ it "does not link urls with invalid domains" do
116
+ @extractor.extract_urls("http://tld-too-short.x").should == []
135
117
  end
136
118
  end
137
119
  end
@@ -150,7 +132,6 @@ describe Twitter::Extractor do
150
132
  end
151
133
 
152
134
  context "international hashtags" do
153
-
154
135
  context "should allow accents" do
155
136
  %w(mañana café münchen).each do |hashtag|
156
137
  it "should extract ##{hashtag}" do
data/spec/regex_spec.rb CHANGED
@@ -2,27 +2,7 @@ require File.dirname(__FILE__) + '/spec_helper'
2
2
 
3
3
  describe "Twitter::Regex regular expressions" do
4
4
  describe "matching URLS" do
5
- @urls = [
6
- "http://google.com",
7
- "http://foobar.com/#",
8
- "http://google.com/#foo",
9
- "http://google.com/#search?q=iphone%20-filter%3Alinks",
10
- "http://twitter.com/#search?q=iphone%20-filter%3Alinks",
11
- "http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html",
12
- "http://somehost.com:3000",
13
- "http://x.com/~matthew+%-x",
14
- "http://en.wikipedia.org/wiki/Primer_(film)",
15
- "http://www.ams.org/bookstore-getitem/item=mbk-59",
16
- "http://chilp.it/?77e8fd",
17
- "www.foobar.com",
18
- "WWW.FOOBAR.COM",
19
- "http://tell.me/why",
20
- "http://longtlds.mobi",
21
- "http://✪df.ws/ejp",
22
- "http://日本.com"
23
- ]
24
-
25
- @urls.each do |url|
5
+ TestUrls::VALID.each do |url|
26
6
  it "should match the URL #{url}" do
27
7
  url.should match_autolink_expression
28
8
  end
@@ -36,19 +16,8 @@ describe "Twitter::Regex regular expressions" do
36
16
 
37
17
  describe "invalid URLS" do
38
18
  it "does not link urls with invalid characters" do
39
- [ "http://doman-dash_2314352345_dfasd.foo-cow_4352.com",
40
- "http://no-tld",
41
- "http://tld-too-short.x",
42
- "http://x.com/,,,/.../@@@/;;;/:::/---/%%%x",
43
- "http://doman_dash_2314352345_dfasd.foo-cow_4352.com",
44
- ].each {|url| url.should_not have_autolinked_url(url)}
45
- end
46
-
47
- it "does not link domains beginning with a hypen" do
48
- pending
49
- "http://-doman_dash_2314352345_dfasd.com".should_not match_autolink_expression
19
+ TestUrls::INVALID.each {|url| url.should_not have_autolinked_url(url)}
50
20
  end
51
-
52
21
  end
53
22
 
54
23
  end
data/spec/spec_helper.rb CHANGED
@@ -3,6 +3,11 @@ $:.push File.join(File.dirname(__FILE__), '..', 'lib')
3
3
 
4
4
  require 'twitter-text'
5
5
  require 'hpricot'
6
+ require 'spec/test_urls'
7
+
8
+ Spec::Runner.configure do |config|
9
+ config.include TestUrls
10
+ end
6
11
 
7
12
  Spec::Matchers.define :match_autolink_expression do
8
13
  match do |string|
@@ -81,6 +86,10 @@ Spec::Matchers.define :have_autolinked_hashtag do |hashtag|
81
86
  end
82
87
 
83
88
  failure_message_for_should do |text|
84
- "Expected hashtag #{hashtag} to be autolinked in '#{text}'"
89
+ if @link
90
+ "Expected link text to be #{hashtag}, but it was #{@link.inner_text}"
91
+ else
92
+ "Expected hashtag #{hashtag} to be autolinked in '#{text}', but no link was found."
93
+ end
85
94
  end
86
95
  end
data/spec/test_urls.rb ADDED
@@ -0,0 +1,30 @@
1
+ module TestUrls
2
+ VALID = [
3
+ "http://google.com",
4
+ "http://foobar.com/#",
5
+ "http://google.com/#foo",
6
+ "http://google.com/#search?q=iphone%20-filter%3Alinks",
7
+ "http://twitter.com/#search?q=iphone%20-filter%3Alinks",
8
+ "http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html",
9
+ "http://somehost.com:3000",
10
+ "http://x.com/~matthew+%-x",
11
+ "http://en.wikipedia.org/wiki/Primer_(film)",
12
+ "http://www.ams.org/bookstore-getitem/item=mbk-59",
13
+ "http://chilp.it/?77e8fd",
14
+ "www.foobar.com",
15
+ "WWW.FOOBAR.COM",
16
+ "http://tell.me/why",
17
+ "http://longtlds.info",
18
+ "http://✪df.ws/ejp",
19
+ "http://日本.com"
20
+ ]
21
+
22
+ INVALID = [
23
+ "http://no-tld",
24
+ "http://tld-too-short.x",
25
+ "http://x.com/,,,/.../@@@/;;;/:::/---/%%%x",
26
+ "http://domain-dash.com",
27
+ "http://-doman_dash.com"
28
+ ]
29
+
30
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitter-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Matt Sanford
@@ -9,7 +9,7 @@ autorequire: ""
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-02-10 00:00:00 -08:00
12
+ date: 2010-03-05 00:00:00 -08:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -45,6 +45,7 @@ files:
45
45
  - spec/extractor_spec.rb
46
46
  - spec/regex_spec.rb
47
47
  - spec/spec_helper.rb
48
+ - spec/test_urls.rb
48
49
  - spec/unicode_spec.rb
49
50
  - spec/validation_spec.rb
50
51
  has_rdoc: true