twitter-text 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -56,3 +56,11 @@ between words.
56
56
  Special care has been taken to be sure that auto-linking and extraction work
57
57
  in Tweets of all languages. This means that languages without spaces between
58
58
  words should work equally well.
59
+
60
+ === Conformance
61
+
62
+ To run the Conformance suite, you'll need to add the twitter-text-conformance project as a git submodule. From the root twitter-text-rb directory, run:
63
+
64
+ git submodule add git@github.com:mzsanford/twitter-text-conformance.git test/twitter-text-conformance/
65
+ git submodule init
66
+ git submodule update
data/Rakefile CHANGED
@@ -7,10 +7,9 @@ require 'spec/rake/spectask'
7
7
  require 'spec/rake/verify_rcov'
8
8
  require 'digest'
9
9
 
10
-
11
10
  spec = Gem::Specification.new do |s|
12
11
  s.name = "twitter-text"
13
- s.version = "1.0.1"
12
+ s.version = "1.0.2"
14
13
  s.author = "Matt Sanford"
15
14
  s.email = "matt@twitter.com"
16
15
  s.homepage = "http://twitter.com"
data/lib/extractor.rb CHANGED
@@ -1,4 +1,3 @@
1
-
2
1
  module Twitter
3
2
  # A module for including Tweet parsing in a class. This module provides functions for the extraction and processing
4
3
  # of usernames, lists, URLs and hashtags.
@@ -13,7 +12,9 @@ module Twitter
13
12
  return [] unless text
14
13
 
15
14
  possible_screen_names = []
16
- text.scan(Twitter::Regex[:extract_mentions]) {|before,sn| possible_screen_names << sn }
15
+ text.scan(Twitter::Regex[:extract_mentions]) do |before, sn, after|
16
+ possible_screen_names << sn unless after =~ Twitter::Regex[:at_signs]
17
+ end
17
18
  possible_screen_names.each{|sn| yield sn } if block_given?
18
19
  possible_screen_names
19
20
  end
@@ -39,7 +40,6 @@ module Twitter
39
40
  # If a block is given then it will be called for each URL.
40
41
  def extract_urls(text) # :yields: url
41
42
  return [] unless text
42
-
43
43
  urls = []
44
44
  text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, path, query|
45
45
  urls << (protocol == "www." ? "http://#{url}" : url)
@@ -66,4 +66,4 @@ module Twitter
66
66
  end
67
67
 
68
68
  end
69
- end
69
+ end
data/lib/regex.rb CHANGED
@@ -26,8 +26,9 @@ module Twitter
26
26
  ].flatten.freeze
27
27
  REGEXEN[:spaces] = Regexp.new(UNICODE_SPACES.collect{ |e| [e].pack 'U*' }.join('|'))
28
28
 
29
- REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])[@@]([a-zA-Z0-9_]{1,20})(?!@)/
30
- REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*[@@]([a-zA-Z0-9_]{1,20})/o
29
+ REGEXEN[:at_signs] = /[@@]/
30
+ REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(?=(.|$))/o
31
+ REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
31
32
 
32
33
  REGEXEN[:list_name] = /^[a-zA-Z\x80-\xff].{0,79}$/
33
34
 
@@ -42,9 +43,9 @@ module Twitter
42
43
  REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
43
44
 
44
45
  # URL related hash regex collection
45
- REGEXEN[:valid_preceeding_chars] = /(?:[^\/"':!=]|^|\:)/
46
- REGEXEN[:valid_domain] = /(?:[\.-]|[^[:punct:]])+\.[a-z]{2,}(?::[0-9]+)?/i
47
- REGEXEN[:valid_url_path_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~]/i
46
+ REGEXEN[:valid_preceding_chars] = /(?:[^\/"':!=]|^|\:)/
47
+ REGEXEN[:valid_domain] = /(?:[\.-]|[^[:punct:]\s])+\.[a-z]{2,}(?::[0-9]+)?/i
48
+ REGEXEN[:valid_url_path_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~@]/i
48
49
  # Valid end-of-path characters (so /foo. does not gobble the period).
49
50
  # 1. Allow ) for Wikipedia URLs.
50
51
  # 2. Allow =&# for empty URL parameters and other URL-join artifacts
@@ -53,7 +54,7 @@ module Twitter
53
54
  REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#]/i
54
55
  REGEXEN[:valid_url] = %r{
55
56
  ( # $1 total match
56
- (#{REGEXEN[:valid_preceeding_chars]}) # $2 Preceeding chracter
57
+ (#{REGEXEN[:valid_preceding_chars]}) # $2 Preceeding chracter
57
58
  ( # $3 URL
58
59
  (https?:\/\/|www\.) # $4 Protocol or beginning
59
60
  (#{REGEXEN[:valid_domain]}) # $5 Domain(s) and optional post number
data/lib/twitter-text.rb CHANGED
@@ -1,4 +1,3 @@
1
-
2
1
  raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless ['u','UTF8'].include?($KCODE)
3
2
 
4
3
  require 'rubygems'
@@ -10,4 +9,4 @@ require File.join(File.dirname(__FILE__), 'regex')
10
9
  require File.join(File.dirname(__FILE__), 'autolink')
11
10
  require File.join(File.dirname(__FILE__), 'extractor')
12
11
  require File.join(File.dirname(__FILE__), 'unicode')
13
- require File.join(File.dirname(__FILE__), 'validation')
12
+ require File.join(File.dirname(__FILE__), 'validation')
data/lib/unicode.rb CHANGED
@@ -1,4 +1,3 @@
1
-
2
1
  module Twitter
3
2
  # This module lazily defines constants of the form Uxxxx for all Unicode
4
3
  # codepoints from U0000 to U10FFFF. The value of each constant is the
@@ -24,4 +23,4 @@ module Twitter
24
23
  end
25
24
  end
26
25
 
27
- end
26
+ end
data/lib/validation.rb CHANGED
@@ -1,4 +1,3 @@
1
-
2
1
  module Twitter
3
2
  module Validation
4
3
  MAX_LENGTH = 140
@@ -48,4 +47,4 @@ module Twitter
48
47
  return false
49
48
  end
50
49
  end
51
- end
50
+ end
@@ -59,21 +59,21 @@ describe Twitter::Extractor do
59
59
  @extractor.extract_reply_screen_name("@alice reply text").should == "alice"
60
60
  end
61
61
 
62
- it "should extract preceeded by a space" do
62
+ it "should extract preceded by a space" do
63
63
  @extractor.extract_reply_screen_name(" @alice reply text").should == "alice"
64
64
  end
65
65
 
66
- it "should extract preceeded by a full-width space" do
66
+ it "should extract preceded by a full-width space" do
67
67
  @extractor.extract_reply_screen_name("#{[0x3000].pack('U')}@alice reply text").should == "alice"
68
68
  end
69
69
  end
70
70
 
71
71
  context "should not be extracted from" do
72
- it "should not be extracted when preceeded by text" do
72
+ it "should not be extracted when preceded by text" do
73
73
  @extractor.extract_reply_screen_name("reply @alice text").should == nil
74
74
  end
75
75
 
76
- it "should not be extracted when preceeded by puctuation" do
76
+ it "should not be extracted when preceded by puctuation" do
77
77
  %w(. / _ - + # ! @).each do |punct|
78
78
  @extractor.extract_reply_screen_name("#{punct}@alice text").should == nil
79
79
  end
@@ -99,39 +99,21 @@ describe Twitter::Extractor do
99
99
 
100
100
  describe "urls" do
101
101
  describe "matching URLS" do
102
- @urls = [
103
- "http://google.com",
104
- "http://foobar.com/#",
105
- "http://google.com/#foo",
106
- "http://google.com/#search?q=iphone%20-filter%3Alinks",
107
- "http://twitter.com/#search?q=iphone%20-filter%3Alinks",
108
- "http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html",
109
- "http://somehost.com:3000",
110
- "http://x.com/~matthew+%-x",
111
- "http://en.wikipedia.org/wiki/Primer_(film)",
112
- "http://www.ams.org/bookstore-getitem/item=mbk-59",
113
- "http://chilp.it/?77e8fd",
114
- ]
115
-
116
- @urls.each do |url|
117
- it "should extract the URL #{url}" do
118
- @extractor.extract_urls(url).should == [url]
102
+ TestUrls::VALID.each do |url|
103
+ it "should extract the URL #{url} and prefix it with a protocol if missing" do
104
+ @extractor.extract_urls(url).first.should include(url)
119
105
  end
120
106
 
121
107
  it "should match the URL #{url} when it's embedded in other text" do
122
108
  text = "Sweet url: #{url} I found. #awesome"
123
- @extractor.extract_urls(text).should == [url]
109
+ @extractor.extract_urls(text).first.should include(url)
124
110
  end
125
111
  end
126
112
  end
127
113
 
128
114
  describe "invalid URLS" do
129
- it "does not link urls with invalid_domains" do
130
- [ "http://doman-dash_2314352345_dfasd.foo-cow_4352.com",
131
- "http://no-tld",
132
- "http://tld-too-short.x",
133
- "http://doman-dash_2314352345_dfasd.foo-cow_4352.com",
134
- ].each {|url| @extractor.extract_urls(url).should == [] }
115
+ it "does not link urls with invalid domains" do
116
+ @extractor.extract_urls("http://tld-too-short.x").should == []
135
117
  end
136
118
  end
137
119
  end
@@ -150,7 +132,6 @@ describe Twitter::Extractor do
150
132
  end
151
133
 
152
134
  context "international hashtags" do
153
-
154
135
  context "should allow accents" do
155
136
  %w(mañana café münchen).each do |hashtag|
156
137
  it "should extract ##{hashtag}" do
data/spec/regex_spec.rb CHANGED
@@ -2,27 +2,7 @@ require File.dirname(__FILE__) + '/spec_helper'
2
2
 
3
3
  describe "Twitter::Regex regular expressions" do
4
4
  describe "matching URLS" do
5
- @urls = [
6
- "http://google.com",
7
- "http://foobar.com/#",
8
- "http://google.com/#foo",
9
- "http://google.com/#search?q=iphone%20-filter%3Alinks",
10
- "http://twitter.com/#search?q=iphone%20-filter%3Alinks",
11
- "http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html",
12
- "http://somehost.com:3000",
13
- "http://x.com/~matthew+%-x",
14
- "http://en.wikipedia.org/wiki/Primer_(film)",
15
- "http://www.ams.org/bookstore-getitem/item=mbk-59",
16
- "http://chilp.it/?77e8fd",
17
- "www.foobar.com",
18
- "WWW.FOOBAR.COM",
19
- "http://tell.me/why",
20
- "http://longtlds.mobi",
21
- "http://✪df.ws/ejp",
22
- "http://日本.com"
23
- ]
24
-
25
- @urls.each do |url|
5
+ TestUrls::VALID.each do |url|
26
6
  it "should match the URL #{url}" do
27
7
  url.should match_autolink_expression
28
8
  end
@@ -36,19 +16,8 @@ describe "Twitter::Regex regular expressions" do
36
16
 
37
17
  describe "invalid URLS" do
38
18
  it "does not link urls with invalid characters" do
39
- [ "http://doman-dash_2314352345_dfasd.foo-cow_4352.com",
40
- "http://no-tld",
41
- "http://tld-too-short.x",
42
- "http://x.com/,,,/.../@@@/;;;/:::/---/%%%x",
43
- "http://doman_dash_2314352345_dfasd.foo-cow_4352.com",
44
- ].each {|url| url.should_not have_autolinked_url(url)}
45
- end
46
-
47
- it "does not link domains beginning with a hypen" do
48
- pending
49
- "http://-doman_dash_2314352345_dfasd.com".should_not match_autolink_expression
19
+ TestUrls::INVALID.each {|url| url.should_not have_autolinked_url(url)}
50
20
  end
51
-
52
21
  end
53
22
 
54
23
  end
data/spec/spec_helper.rb CHANGED
@@ -3,6 +3,11 @@ $:.push File.join(File.dirname(__FILE__), '..', 'lib')
3
3
 
4
4
  require 'twitter-text'
5
5
  require 'hpricot'
6
+ require 'spec/test_urls'
7
+
8
+ Spec::Runner.configure do |config|
9
+ config.include TestUrls
10
+ end
6
11
 
7
12
  Spec::Matchers.define :match_autolink_expression do
8
13
  match do |string|
@@ -81,6 +86,10 @@ Spec::Matchers.define :have_autolinked_hashtag do |hashtag|
81
86
  end
82
87
 
83
88
  failure_message_for_should do |text|
84
- "Expected hashtag #{hashtag} to be autolinked in '#{text}'"
89
+ if @link
90
+ "Expected link text to be #{hashtag}, but it was #{@link.inner_text}"
91
+ else
92
+ "Expected hashtag #{hashtag} to be autolinked in '#{text}', but no link was found."
93
+ end
85
94
  end
86
95
  end
data/spec/test_urls.rb ADDED
@@ -0,0 +1,30 @@
1
+ module TestUrls
2
+ VALID = [
3
+ "http://google.com",
4
+ "http://foobar.com/#",
5
+ "http://google.com/#foo",
6
+ "http://google.com/#search?q=iphone%20-filter%3Alinks",
7
+ "http://twitter.com/#search?q=iphone%20-filter%3Alinks",
8
+ "http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html",
9
+ "http://somehost.com:3000",
10
+ "http://x.com/~matthew+%-x",
11
+ "http://en.wikipedia.org/wiki/Primer_(film)",
12
+ "http://www.ams.org/bookstore-getitem/item=mbk-59",
13
+ "http://chilp.it/?77e8fd",
14
+ "www.foobar.com",
15
+ "WWW.FOOBAR.COM",
16
+ "http://tell.me/why",
17
+ "http://longtlds.info",
18
+ "http://✪df.ws/ejp",
19
+ "http://日本.com"
20
+ ]
21
+
22
+ INVALID = [
23
+ "http://no-tld",
24
+ "http://tld-too-short.x",
25
+ "http://x.com/,,,/.../@@@/;;;/:::/---/%%%x",
26
+ "http://domain-dash.com",
27
+ "http://-doman_dash.com"
28
+ ]
29
+
30
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitter-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Matt Sanford
@@ -9,7 +9,7 @@ autorequire: ""
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-02-10 00:00:00 -08:00
12
+ date: 2010-03-05 00:00:00 -08:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -45,6 +45,7 @@ files:
45
45
  - spec/extractor_spec.rb
46
46
  - spec/regex_spec.rb
47
47
  - spec/spec_helper.rb
48
+ - spec/test_urls.rb
48
49
  - spec/unicode_spec.rb
49
50
  - spec/validation_spec.rb
50
51
  has_rdoc: true