twitter-text 1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +13 -0
- data/README.rdoc +58 -0
- data/Rakefile +92 -0
- data/TODO +3 -0
- data/lib/autolink.rb +101 -0
- data/lib/extractor.rb +69 -0
- data/lib/regex.rb +74 -0
- data/lib/twitter-text.rb +13 -0
- data/lib/unicode.rb +27 -0
- data/lib/validation.rb +51 -0
- data/spec/autolinking_spec.rb +427 -0
- data/spec/extractor_spec.rb +195 -0
- data/spec/regex_spec.rb +44 -0
- data/spec/spec_helper.rb +86 -0
- data/spec/unicode_spec.rb +30 -0
- data/spec/validation_spec.rb +42 -0
- metadata +79 -0
@@ -0,0 +1,195 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
|
3
|
+
# Bare host class for the specs below: it only mixes in Twitter::Extractor so
# the extract_* methods can be invoked on a plain instance.
class TestExtractor
  include Twitter::Extractor
end
|
6
|
+
|
7
|
+
describe Twitter::Extractor do
|
8
|
+
# Each example gets its own TestExtractor instance in @extractor.
before(:each) { @extractor = TestExtractor.new }
|
11
|
+
|
12
|
+
# Specs for extract_mentioned_screen_names: the screen names mentioned in the
# text are expected back without the leading "@", in order of appearance.
# The example descriptions talk about extraction (not linking), since this
# spec exercises the extractor rather than the autolinker.
describe "mentions" do
  context "single screen name alone" do
    it "should be extracted" do
      @extractor.extract_mentioned_screen_names("@alice").should == ["alice"]
    end

    it "should be extracted with _" do
      @extractor.extract_mentioned_screen_names("@alice_adams").should == ["alice_adams"]
    end

    it "should be extracted if numeric" do
      @extractor.extract_mentioned_screen_names("@1234").should == ["1234"]
    end
  end

  context "multiple screen names" do
    it "should both be extracted" do
      @extractor.extract_mentioned_screen_names("@alice @bob").should == ["alice", "bob"]
    end
  end

  context "screen names embedded in text" do
    it "should be extracted in Latin text" do
      @extractor.extract_mentioned_screen_names("waiting for @alice to arrive").should == ["alice"]
    end

    it "should be extracted in Japanese text" do
      @extractor.extract_mentioned_screen_names("の@aliceに到着を待っている").should == ["alice"]
    end
  end

  it "should accept a block argument and call it in order" do
    needed = ["alice", "bob"]
    @extractor.extract_mentioned_screen_names("@alice @bob") do |sn|
      sn.should == needed.shift
    end
    # An empty list proves the block ran once per expected name.
    needed.should == []
  end
end
|
51
|
+
|
52
|
+
# Specs for extract_reply_screen_name: a screen name is expected only when the
# "@name" opens the text (optionally after whitespace); otherwise nil.
describe "replies" do
  context "should be extracted from" do
    it "should extract from lone name" do
      @extractor.extract_reply_screen_name("@alice").should == "alice"
    end

    it "should extract from the start" do
      @extractor.extract_reply_screen_name("@alice reply text").should == "alice"
    end

    it "should extract preceded by a space" do
      @extractor.extract_reply_screen_name(" @alice reply text").should == "alice"
    end

    it "should extract preceded by a full-width space" do
      # U+3000 IDEOGRAPHIC SPACE
      @extractor.extract_reply_screen_name("#{[0x3000].pack('U')}@alice reply text").should == "alice"
    end
  end

  context "should not be extracted from" do
    it "should not be extracted when preceded by text" do
      @extractor.extract_reply_screen_name("reply @alice text").should be_nil
    end

    it "should not be extracted when preceded by punctuation" do
      %w(. / _ - + # ! @).each do |punct|
        @extractor.extract_reply_screen_name("#{punct}@alice text").should be_nil
      end
    end
  end

  context "should accept a block argument" do
    it "should call the block on match" do
      # Record what is yielded and assert afterwards; an expectation placed
      # only inside the block would pass silently if the block never ran.
      yielded = []
      @extractor.extract_reply_screen_name("@alice") { |sn| yielded << sn }
      yielded.should == ["alice"]
    end

    it "should not call the block on no match" do
      calls = 0
      @extractor.extract_reply_screen_name("not a reply") do |sn|
        calls += 1
      end
      calls.should == 0
    end
  end
end
|
99
|
+
|
100
|
+
# Specs for extract_urls: valid URLs are expected back verbatim, both alone and
# embedded in surrounding text; URLs with invalid domains yield an empty list.
describe "urls" do
  describe "matching URLS" do
    # Only needed while the examples are being defined, so a local suffices.
    urls = [
      "http://google.com",
      "http://foobar.com/#",
      "http://google.com/#foo",
      "http://google.com/#search?q=iphone%20-filter%3Alinks",
      "http://twitter.com/#search?q=iphone%20-filter%3Alinks",
      "http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html",
      "http://somehost.com:3000",
      "http://x.com/~matthew+%-x",
      "http://x.com/~matthew+%-,.;@:x",
      "http://x.com/,.;@:x",
      "http://en.wikipedia.org/wiki/Primer_(film)",
      "http://www.ams.org/bookstore-getitem/item=mbk-59",
      "http://chilp.it/?77e8fd",
    ]

    urls.each do |url|
      it "should extract the URL #{url}" do
        @extractor.extract_urls(url).should == [url]
      end

      it "should match the URL #{url} when it's embedded in other text" do
        text = "Sweet url: #{url} I found. #awesome"
        @extractor.extract_urls(text).should == [url]
      end
    end
  end

  describe "invalid URLS" do
    # One example per URL so a failure names the offending input. The list
    # previously contained the first entry twice; the duplicate is dropped.
    [
      "http://doman-dash_2314352345_dfasd.foo-cow_4352.com",
      "http://no-tld",
      "http://tld-too-short.x",
    ].each do |url|
      it "does not extract the invalid-domain URL #{url}" do
        @extractor.extract_urls(url).should == []
      end
    end
  end
end
|
140
|
+
|
141
|
+
# Specs for extract_hashtags: the tag text is expected back without the "#".
describe "hashtags" do
  context "extracts latin/numeric hashtags" do
    ["text", "text123", "123text"].each do |tag|
      it "should extract ##{tag}" do
        @extractor.extract_hashtags("##{tag}").should == [tag]
      end

      it "should extract ##{tag} within text" do
        @extractor.extract_hashtags("pre-text ##{tag} post-text").should == [tag]
      end
    end
  end

  context "international hashtags" do

    context "should allow accents" do
      ["mañana", "café", "münchen"].each do |tag|
        it "should extract ##{tag}" do
          @extractor.extract_hashtags("##{tag}").should == [tag]
        end

        it "should extract ##{tag} within text" do
          @extractor.extract_hashtags("pre-text ##{tag} post-text").should == [tag]
        end
      end

      it "should not allow the multiplication character" do
        times_sign = [0xd7].pack('U') # U+00D7
        @extractor.extract_hashtags("#pre#{times_sign}post").should == ["pre"]
      end

      it "should not allow the division character" do
        division_sign = [0xf7].pack('U') # U+00F7
        @extractor.extract_hashtags("#pre#{division_sign}post").should == ["pre"]
      end
    end

    context "should NOT allow Japanese" do
      ["会議中", "ハッシュ"].each do |tag|
        it "should NOT extract ##{tag}" do
          @extractor.extract_hashtags("##{tag}").should == []
        end

        it "should NOT extract ##{tag} within text" do
          @extractor.extract_hashtags("pre-text ##{tag} post-text").should == []
        end
      end
    end

  end

  it "should not extract numeric hashtags" do
    @extractor.extract_hashtags("#1234").should == []
  end
end
|
194
|
+
|
195
|
+
end
|
data/spec/regex_spec.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
|
3
|
+
# Specs for the :valid_url expression, exercised through the custom matchers
# defined in spec_helper.
describe "Twitter::Regex regular expressions" do
  describe "matching URLS" do
    # Only needed while the examples are being defined, so a local suffices.
    urls = [
      "http://google.com",
      "http://foobar.com/#",
      "http://google.com/#foo",
      "http://google.com/#search?q=iphone%20-filter%3Alinks",
      "http://twitter.com/#search?q=iphone%20-filter%3Alinks",
      "http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html",
      "http://somehost.com:3000",
      "http://x.com/~matthew+%-x",
      "http://x.com/~matthew+%-,.;@:x",
      "http://x.com/,.;@:x",
      "http://en.wikipedia.org/wiki/Primer_(film)",
      "http://www.ams.org/bookstore-getitem/item=mbk-59",
      "http://chilp.it/?77e8fd",
    ]

    urls.each do |url|
      it "should match the URL #{url}" do
        url.should match_autolink_expression
      end

      it "should match the URL #{url} when it's embedded in other text" do
        text = "Sweet url: #{url} I found. #awesome"
        url.should match_autolink_expression_in(text)
      end
    end
  end

  describe "invalid URLS" do
    it "does not link urls with invalid_domains" do
      # NOTE(review): have_autolinked_url searches its subject for an <a href>
      # element, and the subject here is the bare URL string, so this
      # expectation looks like it passes regardless of the regex. Confirm
      # whether a regex-based matcher (or autolinking the text first) was
      # intended.
      # The first entry used to appear twice in this list; the duplicate is dropped.
      [
        "http://doman-dash_2314352345_dfasd.foo-cow_4352.com",
        "http://no-tld",
        "http://tld-too-short.x",
        "http://x.com/,,,/.../@@@/;;;/:::/---/%%%x",
      ].each { |url| url.should_not have_autolinked_url(url) }
    end
  end

end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
$TESTING=true
|
2
|
+
$:.push File.join(File.dirname(__FILE__), '..', 'lib')
|
3
|
+
|
4
|
+
require 'twitter-text'
|
5
|
+
require 'hpricot'
|
6
|
+
|
7
|
+
# Matcher: passes when Twitter::Regex[:valid_url] produces a match against the
# string under test.
Spec::Matchers.define :match_autolink_expression do
  match { |candidate| Twitter::Regex[:valid_url].match(candidate) }
end
|
12
|
+
|
13
|
+
# Matcher: `url.should match_autolink_expression_in(text)` passes when the
# match of Twitter::Regex[:valid_url] against +text+, with surrounding
# whitespace stripped, equals +url+.
Spec::Matchers.define :match_autolink_expression_in do |text|
  match do |url|
    @match_data = Twitter::Regex[:valid_url].match(text)
    @match_data && @match_data.to_s.strip == url
  end

  failure_message_for_should do |url|
    # @match_data is nil when the expression matched nothing at all; guard so
    # that building the message does not itself raise NoMethodError.
    outcome = if @match_data
      "the match was #{@match_data.captures.inspect}"
    else
      "nothing matched"
    end
    "Expected to find url '#{url}' in text '#{text}', but #{outcome}"
  end
end
|
23
|
+
|
24
|
+
# Matcher: parses the text under test with Hpricot and passes when it holds an
# <a> element whose href is +url+ and whose inner text is also +url+.
Spec::Matchers.define :have_autolinked_url do |url|
  match do |text|
    @link = Hpricot(text).at("a[@href='#{url}']")
    label = @link && @link.inner_text
    label && label == url
  end

  failure_message_for_should { |text| "Expected url '#{url}' to be autolinked in '#{text}'" }
end
|
36
|
+
|
37
|
+
# Matcher: parses the text under test with Hpricot and passes when its first
# "a.username" element has +screen_name+ as inner text and the downcased
# "http://twitter.com/<screen_name>" as href.
Spec::Matchers.define :link_to_screen_name do |screen_name|
  match do |text|
    @link = Hpricot(text).at("a.username")
    # Plain comparisons only: an expectation (`.should`) inside the match
    # block would raise instead of returning false, which defeats should_not.
    !@link.nil? &&
      @link.inner_text == screen_name &&
      "http://twitter.com/#{screen_name}".downcase == @link['href']
  end

  failure_message_for_should do |text|
    # @link is nil when no a.username element was found at all.
    if @link
      "expected link #{@link.inner_text} with href #{@link['href']} to match screen_name #{screen_name}, but it does not"
    else
      "expected '#{text}' to contain a link to screen_name #{screen_name}, but no a.username link was found"
    end
  end

  failure_message_for_should_not do |text|
    # Interpolates the matcher argument; no @screen_name ivar is ever assigned.
    "expected link #{@link.inner_text} with href #{@link['href']} not to match screen_name #{screen_name}, but it does"
  end

  description do
    "contain a link with the name and href pointing to the expected screen_name"
  end
end
|
55
|
+
|
56
|
+
# Matcher: parses the text under test with Hpricot and passes when its first
# "a.list-slug" element has +list_path+ as inner text and the downcased
# "http://twitter.com/<list_path>" as href.
Spec::Matchers.define :link_to_list_path do |list_path|
  match do |text|
    @link = Hpricot(text).at("a.list-slug")
    # Plain comparisons only: an expectation (`.should`) inside the match
    # block would raise instead of returning false, which defeats should_not.
    !@link.nil? &&
      @link.inner_text == list_path &&
      "http://twitter.com/#{list_path}".downcase == @link['href']
  end

  failure_message_for_should do |text|
    # @link is nil when no a.list-slug element was found at all.
    if @link
      "expected link #{@link.inner_text} with href #{@link['href']} to match the list path #{list_path}, but it does not"
    else
      "expected '#{text}' to contain a link to the list path #{list_path}, but no a.list-slug link was found"
    end
  end

  failure_message_for_should_not do |text|
    # Interpolates the matcher argument; no @list_path ivar is ever assigned.
    "expected link #{@link.inner_text} with href #{@link['href']} not to match the list path #{list_path}, but it does"
  end

  description do
    "contain a link with the list title and an href pointing to the list path"
  end
end
|
74
|
+
|
75
|
+
# Matcher: parses the text under test with Hpricot and passes when it holds an
# <a> element pointing at the Twitter search URL for +hashtag+ (CGI-escaped)
# whose inner text equals +hashtag+.
# NOTE(review): this file does not require 'cgi' itself; CGI is presumably
# loaded by one of the libraries required above - confirm.
Spec::Matchers.define :have_autolinked_hashtag do |hashtag|
  match do |text|
    search_href = "http://twitter.com/search?q=#{CGI.escape hashtag}"
    @link = Hpricot(text).at("a[@href='#{search_href}']")
    label = @link && @link.inner_text
    label && label == hashtag
  end

  failure_message_for_should { |text| "Expected hashtag #{hashtag} to be autolinked in '#{text}'" }
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
|
3
|
+
# Specs for the Twitter::Unicode constants (U<hex codepoint>), each of which is
# expected to be the one-character string for that codepoint.
describe Twitter::Unicode do

  it "should lazy-init constants" do
    # Statement order matters: the constant must be undefined before its first
    # reference and defined afterwards.
    Twitter::Unicode.const_defined?(:UFEB6).should == false
    Twitter::Unicode::UFEB6.should_not be_nil
    Twitter::Unicode::UFEB6.should be_kind_of(String)
    Twitter::Unicode.const_defined?(:UFEB6).should == true
  end

  it "should return corresponding character" do
    expected_char = [0xfeb6].pack('U')
    Twitter::Unicode::UFEB6.should == expected_char
  end

  it "should allow lowercase notation" do
    lowercase = Twitter::Unicode::Ufeb6
    canonical = Twitter::Unicode::UFEB6
    lowercase.should == canonical
    lowercase.should === canonical
  end

  it "should allow underscore notation" do
    underscored = Twitter::Unicode::U_FEB6
    canonical = Twitter::Unicode::UFEB6
    underscored.should == canonical
    underscored.should === canonical
  end

  it "should raise on invalid codepoints" do
    bad_lookup = lambda { Twitter::Unicode::FFFFFF }
    bad_lookup.should raise_error(NameError)
  end

end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
|
3
|
+
# Bare host class for the specs below: it only mixes in Twitter::Validation so
# tweet_invalid? can be invoked on a plain instance.
class TestValidation
  include Twitter::Validation
end
|
6
|
+
|
7
|
+
# Specs for tweet_invalid?, which is expected to return false for acceptable
# text and a symbol naming the problem (:invalid_characters, :too_long)
# otherwise.
describe Twitter::Validation do

  it "should disallow invalid BOM character" do
    TestValidation.new.tweet_invalid?("Bom:#{Twitter::Unicode::UFFFE}").should == :invalid_characters
    TestValidation.new.tweet_invalid?("Bom:#{Twitter::Unicode::UFEFF}").should == :invalid_characters
  end

  it "should disallow invalid U+FFFF character" do
    TestValidation.new.tweet_invalid?("Bom:#{Twitter::Unicode::UFFFF}").should == :invalid_characters
  end

  it "should disallow direction change characters" do
    [0x202A, 0x202B, 0x202C, 0x202D, 0x202E].map { |cp| [cp].pack('U') }.each do |char|
      TestValidation.new.tweet_invalid?("Invalid:#{char}").should == :invalid_characters
    end
  end

  it "should disallow non-Unicode" do
    # "\xff" is a raw 0xFF byte; the trailing "f0" is two ordinary characters.
    TestValidation.new.tweet_invalid?("not-Unicode:\xfff0").should == :invalid_characters
  end

  it "should allow <= 140 combined accent characters" do
    # 'U*' packs BOTH codepoints ("e" + U+0301 COMBINING ACUTE ACCENT). A bare
    # 'U' consumes only the first array element, which left a plain "e" and
    # meant no combining sequence was ever tested.
    # NOTE(review): assumes tweet_invalid? counts the composed form as a single
    # character - confirm against lib/validation.rb.
    char = [0x65, 0x0301].pack('U*')
    TestValidation.new.tweet_invalid?(char * 139).should == false
    TestValidation.new.tweet_invalid?(char * 140).should == false
    TestValidation.new.tweet_invalid?(char * 141).should == :too_long
  end

  it "should allow <= 140 multi-byte characters" do
    # U+1D106 lies outside the Basic Multilingual Plane.
    char = [0x1d106].pack('U')
    TestValidation.new.tweet_invalid?(char * 139).should == false
    TestValidation.new.tweet_invalid?(char * 140).should == false
    TestValidation.new.tweet_invalid?(char * 141).should == :too_long
  end

end
|
metadata
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: twitter-text
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: "1.0"
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Matt Sanford
|
8
|
+
autorequire: ""
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-01-27 00:00:00 -08:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: action_view
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
version:
|
25
|
+
description: A gem that provides text handling for Twitter
|
26
|
+
email: matt@twitter.com
|
27
|
+
executables: []
|
28
|
+
|
29
|
+
extensions: []
|
30
|
+
|
31
|
+
extra_rdoc_files: []
|
32
|
+
|
33
|
+
files:
|
34
|
+
- LICENSE
|
35
|
+
- README.rdoc
|
36
|
+
- Rakefile
|
37
|
+
- TODO
|
38
|
+
- lib/autolink.rb
|
39
|
+
- lib/extractor.rb
|
40
|
+
- lib/regex.rb
|
41
|
+
- lib/twitter-text.rb
|
42
|
+
- lib/unicode.rb
|
43
|
+
- lib/validation.rb
|
44
|
+
- spec/autolinking_spec.rb
|
45
|
+
- spec/extractor_spec.rb
|
46
|
+
- spec/regex_spec.rb
|
47
|
+
- spec/spec_helper.rb
|
48
|
+
- spec/unicode_spec.rb
|
49
|
+
- spec/validation_spec.rb
|
50
|
+
has_rdoc: true
|
51
|
+
homepage: http://twitter.com
|
52
|
+
licenses: []
|
53
|
+
|
54
|
+
post_install_message:
|
55
|
+
rdoc_options: []
|
56
|
+
|
57
|
+
require_paths:
|
58
|
+
- lib
|
59
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - ">="
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: "0"
|
64
|
+
version:
|
65
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: "0"
|
70
|
+
version:
|
71
|
+
requirements: []
|
72
|
+
|
73
|
+
rubyforge_project:
|
74
|
+
rubygems_version: 1.3.5
|
75
|
+
signing_key:
|
76
|
+
specification_version: 3
|
77
|
+
summary: Twitter text handling library
|
78
|
+
test_files: []
|
79
|
+
|