twitter-text 1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +13 -0
- data/README.rdoc +58 -0
- data/Rakefile +92 -0
- data/TODO +3 -0
- data/lib/autolink.rb +101 -0
- data/lib/extractor.rb +69 -0
- data/lib/regex.rb +74 -0
- data/lib/twitter-text.rb +13 -0
- data/lib/unicode.rb +27 -0
- data/lib/validation.rb +51 -0
- data/spec/autolinking_spec.rb +427 -0
- data/spec/extractor_spec.rb +195 -0
- data/spec/regex_spec.rb +44 -0
- data/spec/spec_helper.rb +86 -0
- data/spec/unicode_spec.rb +30 -0
- data/spec/validation_spec.rb +42 -0
- metadata +79 -0
@@ -0,0 +1,195 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
|
3
|
+
# Empty host class for the specs below: it only mixes in Twitter::Extractor so
# the extract_* methods can be called on a plain instance.
class TestExtractor
  include ::Twitter::Extractor
end
|
6
|
+
|
7
|
+
describe Twitter::Extractor do
|
8
|
+
before do
|
9
|
+
@extractor = TestExtractor.new
|
10
|
+
end
|
11
|
+
|
12
|
+
# Examples for extract_mentioned_screen_names. Each one hands a tweet to the
# extractor and compares the returned names (without the leading "@") against
# the expected list, in order of appearance.
describe "mentions" do
  context "single screen name alone " do
    it "should be linked" do
      names = @extractor.extract_mentioned_screen_names("@alice")
      names.should == ["alice"]
    end

    it "should be linked with _" do
      names = @extractor.extract_mentioned_screen_names("@alice_adams")
      names.should == ["alice_adams"]
    end

    it "should be linked if numeric" do
      names = @extractor.extract_mentioned_screen_names("@1234")
      names.should == ["1234"]
    end
  end

  context "multiple screen names" do
    it "should both be linked" do
      names = @extractor.extract_mentioned_screen_names("@alice @bob")
      names.should == %w(alice bob)
    end
  end

  context "screen names embedded in text" do
    it "should be linked in Latin text" do
      tweet = "waiting for @alice to arrive"
      @extractor.extract_mentioned_screen_names(tweet).should == ["alice"]
    end

    it "should be linked in Japanese text" do
      tweet = "の@aliceに到着を待っている"
      @extractor.extract_mentioned_screen_names(tweet).should == ["alice"]
    end
  end

  it "should accept a block arugment and call it in order" do
    # Names still waiting to be yielded; each yield must match the head.
    remaining = %w(alice bob)
    @extractor.extract_mentioned_screen_names("@alice @bob") { |screen_name| screen_name.should == remaining.shift }
    # Nothing left over means the block ran once per mention.
    remaining.should == []
  end
end
|
51
|
+
|
52
|
+
# Examples for extract_reply_screen_name: a tweet counts as a reply only when
# the @name is the first thing in it, optionally after whitespace.
describe "replies" do
  context "should be extracted from" do
    it "should extract from lone name" do
      @extractor.extract_reply_screen_name("@alice").should == "alice"
    end

    it "should extract from the start" do
      @extractor.extract_reply_screen_name("@alice reply text").should == "alice"
    end

    it "should extract preceeded by a space" do
      @extractor.extract_reply_screen_name(" @alice reply text").should == "alice"
    end

    it "should extract preceeded by a full-width space" do
      # U+3000 is the ideographic (full-width) space.
      @extractor.extract_reply_screen_name("#{[0x3000].pack('U')}@alice reply text").should == "alice"
    end
  end

  context "should not be extracted from" do
    it "should not be extracted when preceeded by text" do
      @extractor.extract_reply_screen_name("reply @alice text").should == nil
    end

    it "should not be extracted when preceeded by puctuation" do
      %w(. / _ - + # ! @).each do |punct|
        @extractor.extract_reply_screen_name("#{punct}@alice text").should == nil
      end
    end
  end

  context "should accept a block arugment" do
    it "should call the block on match" do
      # Record what the block receives and assert afterwards. Asserting only
      # inside the block would pass silently if the block were never called.
      yielded = []
      @extractor.extract_reply_screen_name("@alice") do |sn|
        yielded << sn
      end
      yielded.should == ["alice"]
    end

    it "should not call the block on no match" do
      calls = 0
      @extractor.extract_reply_screen_name("not a reply") do |sn|
        calls += 1
      end
      calls.should == 0
    end
  end
end
|
99
|
+
|
100
|
+
# Examples for extract_urls: every URL in the valid list must be returned
# verbatim, alone or embedded in a sentence, and nothing may be returned for
# the URLs in the invalid list.
describe "urls" do
  describe "matching URLS" do
    # A plain local is enough here: it is only read while the examples are
    # being generated, in this same block.
    valid_urls = [
      "http://google.com",
      "http://foobar.com/#",
      "http://google.com/#foo",
      "http://google.com/#search?q=iphone%20-filter%3Alinks",
      "http://twitter.com/#search?q=iphone%20-filter%3Alinks",
      "http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html",
      "http://somehost.com:3000",
      "http://x.com/~matthew+%-x",
      "http://x.com/~matthew+%-,.;@:x",
      "http://x.com/,.;@:x",
      "http://en.wikipedia.org/wiki/Primer_(film)",
      "http://www.ams.org/bookstore-getitem/item=mbk-59",
      "http://chilp.it/?77e8fd",
    ]

    valid_urls.each do |url|
      it "should extract the URL #{url}" do
        @extractor.extract_urls(url).should == [url]
      end

      it "should match the URL #{url} when it's embedded in other text" do
        text = "Sweet url: #{url} I found. #awesome"
        @extractor.extract_urls(text).should == [url]
      end
    end
  end

  describe "invalid URLS" do
    it "does not link urls with invalid_domains" do
      # The first entry used to be listed twice; once is enough.
      [ "http://doman-dash_2314352345_dfasd.foo-cow_4352.com",
        "http://no-tld",
        "http://tld-too-short.x",
      ].each {|url| @extractor.extract_urls(url).should == [] }
    end
  end
end
|
140
|
+
|
141
|
+
# Examples for extract_hashtags. Examples are generated per tag, once for the
# bare "#tag" and once for the tag surrounded by other text; the expected
# result never includes the leading "#".
describe "hashtags" do
  context "extracts latin/numeric hashtags" do
    ["text", "text123", "123text"].each do |tag|
      it "should extract ##{tag}" do
        @extractor.extract_hashtags("##{tag}").should == [tag]
      end

      it "should extract ##{tag} within text" do
        tweet = "pre-text ##{tag} post-text"
        @extractor.extract_hashtags(tweet).should == [tag]
      end
    end
  end

  context "international hashtags" do

    context "should allow accents" do
      ["mañana", "café", "münchen"].each do |tag|
        it "should extract ##{tag}" do
          @extractor.extract_hashtags("##{tag}").should == [tag]
        end

        it "should extract ##{tag} within text" do
          tweet = "pre-text ##{tag} post-text"
          @extractor.extract_hashtags(tweet).should == [tag]
        end
      end

      it "should not allow the multiplication character" do
        # U+00D7 sits among the accented Latin letters but must end the tag.
        multiplication_sign = [0xd7].pack('U')
        @extractor.extract_hashtags("#pre#{multiplication_sign}post").should == ["pre"]
      end

      it "should not allow the division character" do
        # U+00F7, same reasoning as the multiplication sign.
        division_sign = [0xf7].pack('U')
        @extractor.extract_hashtags("#pre#{division_sign}post").should == ["pre"]
      end
    end

    context "should NOT allow Japanese" do
      ["会議中", "ハッシュ"].each do |tag|
        it "should NOT extract ##{tag}" do
          @extractor.extract_hashtags("##{tag}").should == []
        end

        it "should NOT extract ##{tag} within text" do
          tweet = "pre-text ##{tag} post-text"
          @extractor.extract_hashtags(tweet).should == []
        end
      end
    end

  end

  it "should not extract numeric hashtags" do
    @extractor.extract_hashtags("#1234").should == []
  end
end
|
194
|
+
|
195
|
+
end
|
data/spec/regex_spec.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
|
3
|
+
# Examples for the URL regular expression, exercised through the
# match_autolink_expression / match_autolink_expression_in matchers.
describe "Twitter::Regex regular expressions" do
  describe "matching URLS" do
    # A plain local is enough here: it is only read while the examples are
    # being generated, in this same block.
    valid_urls = [
      "http://google.com",
      "http://foobar.com/#",
      "http://google.com/#foo",
      "http://google.com/#search?q=iphone%20-filter%3Alinks",
      "http://twitter.com/#search?q=iphone%20-filter%3Alinks",
      "http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html",
      "http://somehost.com:3000",
      "http://x.com/~matthew+%-x",
      "http://x.com/~matthew+%-,.;@:x",
      "http://x.com/,.;@:x",
      "http://en.wikipedia.org/wiki/Primer_(film)",
      "http://www.ams.org/bookstore-getitem/item=mbk-59",
      "http://chilp.it/?77e8fd",
    ]

    valid_urls.each do |url|
      it "should match the URL #{url}" do
        url.should match_autolink_expression
      end

      it "should match the URL #{url} when it's embedded in other text" do
        text = "Sweet url: #{url} I found. #awesome"
        url.should match_autolink_expression_in(text)
      end
    end
  end

  describe "invalid URLS" do
    it "does not link urls with invalid_domains" do
      # NOTE(review): have_autolinked_url parses its subject as HTML and looks
      # for an <a href=...>. The subjects here are bare URL strings that never
      # pass through the regex or the autolinker, so this expectation looks
      # like it cannot fail. Confirm whether `should_not
      # match_autolink_expression` was intended.
      # The first entry used to be listed twice; once is enough.
      [ "http://doman-dash_2314352345_dfasd.foo-cow_4352.com",
        "http://no-tld",
        "http://tld-too-short.x",
        "http://x.com/,,,/.../@@@/;;;/:::/---/%%%x",
      ].each {|url| url.should_not have_autolinked_url(url)}
    end
  end

end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
$TESTING=true
|
2
|
+
$:.push File.join(File.dirname(__FILE__), '..', 'lib')
|
3
|
+
|
4
|
+
require 'twitter-text'
|
5
|
+
require 'hpricot'
|
6
|
+
|
7
|
+
# Matcher: passes when the URL regex (Twitter::Regex[:valid_url]) finds a
# match anywhere in the subject string.
Spec::Matchers.define(:match_autolink_expression) do
  match { |candidate| !Twitter::Regex[:valid_url].match(candidate).nil? }
end
|
12
|
+
|
13
|
+
# Matcher: `url.should match_autolink_expression_in(text)` passes when the URL
# regex, run against `text`, produces a match whose stripped text equals `url`.
Spec::Matchers.define :match_autolink_expression_in do |text|
  match do |url|
    @match_data = Twitter::Regex[:valid_url].match(text)
    @match_data && @match_data.to_s.strip == url
  end

  failure_message_for_should do |url|
    # @match_data is nil when the regex found nothing at all; report that
    # instead of calling #captures on nil and hiding the real failure.
    found = @match_data ? @match_data.captures.inspect : "nothing (no match)"
    "Expected to find url '#{url}' in text '#{text}', but the match was #{found}"
  end
end
|
23
|
+
|
24
|
+
# Matcher: `html.should have_autolinked_url(url)` passes when the subject HTML
# contains an <a> whose href is `url` and whose text is also `url`.
Spec::Matchers.define(:have_autolinked_url) do |url|
  match do |text|
    @link = Hpricot(text).at("a[@href='#{url}']")
    # nil when there is no such anchor, or when the anchor has no text.
    anchor_text = @link && @link.inner_text
    anchor_text ? anchor_text == url : false
  end

  failure_message_for_should { |text| "Expected url '#{url}' to be autolinked in '#{text}'" }
end
|
36
|
+
|
37
|
+
# Matcher: `html.should link_to_screen_name(name)` passes when the a.username
# element found in the subject HTML has `name` as its text and the downcased
# "http://twitter.com/<name>" as its href.
Spec::Matchers.define :link_to_screen_name do |screen_name|
  match do |text|
    @link = Hpricot(text).at("a.username")
    # Use plain comparisons only. A nested `.should` would raise on mismatch
    # instead of returning false, which breaks `should_not` and bypasses the
    # failure messages below.
    @link &&
      @link.inner_text == screen_name &&
      "http://twitter.com/#{screen_name}".downcase == @link['href']
  end

  failure_message_for_should do |text|
    if @link
      "expected link #{@link.inner_text} with href #{@link['href']} to match screen_name #{screen_name}, but it does not"
    else
      # No a.username element at all; there is nothing to describe.
      "expected a link to screen_name #{screen_name}, but found no a.username link in '#{text}'"
    end
  end

  failure_message_for_should_not do |text|
    # Only reached after a successful match, so @link is set here.
    "expected link #{@link.inner_text} with href #{@link['href']} not to match screen_name #{screen_name}, but it does"
  end

  description do
    "contain a link with the name and href pointing to the expected screen_name"
  end
end
|
55
|
+
|
56
|
+
# Matcher: `html.should link_to_list_path(path)` passes when the a.list-slug
# element found in the subject HTML has `path` as its text and the downcased
# "http://twitter.com/<path>" as its href.
Spec::Matchers.define :link_to_list_path do |list_path|
  match do |text|
    @link = Hpricot(text).at("a.list-slug")
    # Use plain comparisons only. A nested `.should` would raise on mismatch
    # instead of returning false, which breaks `should_not` and bypasses the
    # failure messages below.
    !@link.nil? &&
      @link.inner_text == list_path &&
      "http://twitter.com/#{list_path}".downcase == @link['href']
  end

  failure_message_for_should do |text|
    if @link
      "expected link #{@link.inner_text} with href #{@link['href']} to match the list path #{list_path}, but it does not"
    else
      # No a.list-slug element at all; there is nothing to describe.
      "expected a link to the list path #{list_path}, but found no a.list-slug link in '#{text}'"
    end
  end

  failure_message_for_should_not do |text|
    # Only reached after a successful match, so @link is set here.
    "expected link #{@link.inner_text} with href #{@link['href']} not to match the list path #{list_path}, but it does"
  end

  description do
    "contain a link with the list title and an href pointing to the list path"
  end
end
|
74
|
+
|
75
|
+
# CGI.escape is used below, but this helper's own requires are only
# 'twitter-text' and 'hpricot'. Load cgi explicitly so the matcher does not
# depend on some other library having required it first.
require 'cgi'

# Matcher: `html.should have_autolinked_hashtag(tag)` passes when the subject
# HTML contains an <a> whose href is the Twitter search URL for the
# CGI-escaped `tag` and whose text equals `tag`.
Spec::Matchers.define :have_autolinked_hashtag do |hashtag|
  match do |text|
    @link = Hpricot(text).at("a[@href='http://twitter.com/search?q=#{CGI.escape hashtag}']")
    @link &&
      @link.inner_text &&
      @link.inner_text == hashtag
  end

  failure_message_for_should do |text|
    "Expected hashtag #{hashtag} to be autolinked in '#{text}'"
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
|
3
|
+
# Examples for Twitter::Unicode, which exposes codepoints as constants named
# after their hex value (UFEB6, Ufeb6, U_FEB6).
describe Twitter::Unicode do

  it "should lazy-init constants" do
    # NOTE(review): the first expectation only holds if nothing has referenced
    # UFEB6 earlier in the same process.
    Twitter::Unicode.const_defined?(:UFEB6).should == false

    Twitter::Unicode::UFEB6.should_not be_nil
    Twitter::Unicode::UFEB6.should be_kind_of(String)

    Twitter::Unicode.const_defined?(:UFEB6).should == true
  end

  it "should return corresponding character" do
    expected = [0xfeb6].pack('U')
    Twitter::Unicode::UFEB6.should == expected
  end

  it "should allow lowercase notation" do
    [:==, :===].each do |comparison|
      Twitter::Unicode::Ufeb6.should.send(comparison, Twitter::Unicode::UFEB6)
    end
  end

  it "should allow underscore notation" do
    [:==, :===].each do |comparison|
      Twitter::Unicode::U_FEB6.should.send(comparison, Twitter::Unicode::UFEB6)
    end
  end

  it "should raise on invalid codepoints" do
    lookup = lambda { Twitter::Unicode::FFFFFF }
    lookup.should raise_error(NameError)
  end

end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
|
3
|
+
# Empty host class for the specs below: it only mixes in Twitter::Validation so
# tweet_invalid? can be called on a plain instance.
class TestValidation
  include ::Twitter::Validation
end
|
6
|
+
|
7
|
+
# Examples for tweet_invalid?, which returns false for an acceptable tweet and
# a symbol (:invalid_characters, :too_long) naming the problem otherwise.
describe Twitter::Validation do
  before do
    @validator = TestValidation.new
  end

  it "should disallow invalid BOM character" do
    @validator.tweet_invalid?("Bom:#{Twitter::Unicode::UFFFE}").should == :invalid_characters
    @validator.tweet_invalid?("Bom:#{Twitter::Unicode::UFEFF}").should == :invalid_characters
  end

  it "should disallow invalid U+FFFF character" do
    @validator.tweet_invalid?("Bom:#{Twitter::Unicode::UFFFF}").should == :invalid_characters
  end

  it "should disallow direction change characters" do
    [0x202A, 0x202B, 0x202C, 0x202D, 0x202E].map{|cp| [cp].pack('U') }.each do |char|
      @validator.tweet_invalid?("Invalid:#{char}").should == :invalid_characters
    end
  end

  it "should disallow non-Unicode" do
    # "\xff" is a single raw byte; the trailing "f0" is two ordinary characters.
    @validator.tweet_invalid?("not-Unicode:\xfff0").should == :invalid_characters
  end

  it "should allow <= 140 combined accent characters" do
    # 'U*' packs both codepoints ("e" plus U+0301 COMBINING ACUTE ACCENT).
    # A bare 'U' consumes only the first element, which silently reduced this
    # example to a plain "e".
    # NOTE(review): assumes tweet_invalid? counts a base letter plus its
    # combining mark as one character, as the example name states. Confirm
    # against lib/validation.rb.
    char = [0x65, 0x0301].pack('U*')
    @validator.tweet_invalid?(char * 139).should == false
    @validator.tweet_invalid?(char * 140).should == false
    @validator.tweet_invalid?(char * 141).should == :too_long
  end

  it "should allow <= 140 multi-byte characters" do
    # U+1D106 lies outside the Basic Multilingual Plane (4 bytes in UTF-8).
    char = [ 0x1d106 ].pack('U')
    @validator.tweet_invalid?(char * 139).should == false
    @validator.tweet_invalid?(char * 140).should == false
    @validator.tweet_invalid?(char * 141).should == :too_long
  end

end
|
metadata
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: twitter-text
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: "1.0"
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Matt Sanford
|
8
|
+
autorequire: ""
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-01-27 00:00:00 -08:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: action_view
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
version:
|
25
|
+
description: A gem that provides text handling for Twitter
|
26
|
+
email: matt@twitter.com
|
27
|
+
executables: []
|
28
|
+
|
29
|
+
extensions: []
|
30
|
+
|
31
|
+
extra_rdoc_files: []
|
32
|
+
|
33
|
+
files:
|
34
|
+
- LICENSE
|
35
|
+
- README.rdoc
|
36
|
+
- Rakefile
|
37
|
+
- TODO
|
38
|
+
- lib/autolink.rb
|
39
|
+
- lib/extractor.rb
|
40
|
+
- lib/regex.rb
|
41
|
+
- lib/twitter-text.rb
|
42
|
+
- lib/unicode.rb
|
43
|
+
- lib/validation.rb
|
44
|
+
- spec/autolinking_spec.rb
|
45
|
+
- spec/extractor_spec.rb
|
46
|
+
- spec/regex_spec.rb
|
47
|
+
- spec/spec_helper.rb
|
48
|
+
- spec/unicode_spec.rb
|
49
|
+
- spec/validation_spec.rb
|
50
|
+
has_rdoc: true
|
51
|
+
homepage: http://twitter.com
|
52
|
+
licenses: []
|
53
|
+
|
54
|
+
post_install_message:
|
55
|
+
rdoc_options: []
|
56
|
+
|
57
|
+
require_paths:
|
58
|
+
- lib
|
59
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - ">="
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: "0"
|
64
|
+
version:
|
65
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: "0"
|
70
|
+
version:
|
71
|
+
requirements: []
|
72
|
+
|
73
|
+
rubyforge_project:
|
74
|
+
rubygems_version: 1.3.5
|
75
|
+
signing_key:
|
76
|
+
specification_version: 3
|
77
|
+
summary: Twitter text handling library
|
78
|
+
test_files: []
|
79
|
+
|