twitter-text 1.0.1 → 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +8 -0
- data/Rakefile +1 -2
- data/lib/extractor.rb +4 -4
- data/lib/regex.rb +7 -6
- data/lib/twitter-text.rb +1 -2
- data/lib/unicode.rb +1 -2
- data/lib/validation.rb +1 -2
- data/spec/extractor_spec.rb +10 -29
- data/spec/regex_spec.rb +2 -33
- data/spec/spec_helper.rb +10 -1
- data/spec/test_urls.rb +30 -0
- metadata +3 -2
data/README.rdoc
CHANGED
@@ -56,3 +56,11 @@ between words.
|
|
56
56
|
Special care has been taken to be sure that auto-linking and extraction work
|
57
57
|
in Tweets of all languages. This means that languages without spaces between
|
58
58
|
words should work equally well.
|
59
|
+
|
60
|
+
=== Conformance
|
61
|
+
|
62
|
+
To run the Conformance suite, you'll need to add that project as a git submodule. From the root twitter-text-rb directory, run:
|
63
|
+
|
64
|
+
git submodule add git@github.com:mzsanford/twitter-text-conformance.git test/twitter-text-conformance/
|
65
|
+
git submodule init
|
66
|
+
git submodule update
|
data/Rakefile
CHANGED
@@ -7,10 +7,9 @@ require 'spec/rake/spectask'
|
|
7
7
|
require 'spec/rake/verify_rcov'
|
8
8
|
require 'digest'
|
9
9
|
|
10
|
-
|
11
10
|
spec = Gem::Specification.new do |s|
|
12
11
|
s.name = "twitter-text"
|
13
|
-
s.version = "1.0.1"
|
12
|
+
s.version = "1.0.2"
|
14
13
|
s.author = "Matt Sanford"
|
15
14
|
s.email = "matt@twitter.com"
|
16
15
|
s.homepage = "http://twitter.com"
|
data/lib/extractor.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
module Twitter
|
3
2
|
# A module for including Tweet parsing in a class. This module provides functions for the extraction and processing
|
4
3
|
# of usernames, lists, URLs and hashtags.
|
@@ -13,7 +12,9 @@ module Twitter
|
|
13
12
|
return [] unless text
|
14
13
|
|
15
14
|
possible_screen_names = []
|
16
|
-
text.scan(Twitter::Regex[:extract_mentions])
|
15
|
+
text.scan(Twitter::Regex[:extract_mentions]) do |before, sn, after|
|
16
|
+
possible_screen_names << sn unless after =~ Twitter::Regex[:at_signs]
|
17
|
+
end
|
17
18
|
possible_screen_names.each{|sn| yield sn } if block_given?
|
18
19
|
possible_screen_names
|
19
20
|
end
|
@@ -39,7 +40,6 @@ module Twitter
|
|
39
40
|
# If a block is given then it will be called for each URL.
|
40
41
|
def extract_urls(text) # :yields: url
|
41
42
|
return [] unless text
|
42
|
-
|
43
43
|
urls = []
|
44
44
|
text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, path, query|
|
45
45
|
urls << (protocol == "www." ? "http://#{url}" : url)
|
@@ -66,4 +66,4 @@ module Twitter
|
|
66
66
|
end
|
67
67
|
|
68
68
|
end
|
69
|
-
end
|
69
|
+
end
|
data/lib/regex.rb
CHANGED
@@ -26,8 +26,9 @@ module Twitter
|
|
26
26
|
].flatten.freeze
|
27
27
|
REGEXEN[:spaces] = Regexp.new(UNICODE_SPACES.collect{ |e| [e].pack 'U*' }.join('|'))
|
28
28
|
|
29
|
-
REGEXEN[:
|
30
|
-
REGEXEN[:
|
29
|
+
REGEXEN[:at_signs] = /[@@]/
|
30
|
+
REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(?=(.|$))/o
|
31
|
+
REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
|
31
32
|
|
32
33
|
REGEXEN[:list_name] = /^[a-zA-Z\x80-\xff].{0,79}$/
|
33
34
|
|
@@ -42,9 +43,9 @@ module Twitter
|
|
42
43
|
REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\<\|:~\(|\}:o\{|:\-\[|\>o\<|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
|
43
44
|
|
44
45
|
# URL related hash regex collection
|
45
|
-
REGEXEN[:
|
46
|
-
REGEXEN[:valid_domain] = /(?:[\.-]|[^[:punct:]])+\.[a-z]{2,}(?::[0-9]+)?/i
|
47
|
-
REGEXEN[:valid_url_path_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_
|
46
|
+
REGEXEN[:valid_preceding_chars] = /(?:[^\/"':!=]|^|\:)/
|
47
|
+
REGEXEN[:valid_domain] = /(?:[\.-]|[^[:punct:]\s])+\.[a-z]{2,}(?::[0-9]+)?/i
|
48
|
+
REGEXEN[:valid_url_path_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~@]/i
|
48
49
|
# Valid end-of-path characters (so /foo. does not gobble the period).
|
49
50
|
# 1. Allow ) for Wikipedia URLs.
|
50
51
|
# 2. Allow =&# for empty URL parameters and other URL-join artifacts
|
@@ -53,7 +54,7 @@ module Twitter
|
|
53
54
|
REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#]/i
|
54
55
|
REGEXEN[:valid_url] = %r{
|
55
56
|
( # $1 total match
|
56
|
-
(#{REGEXEN[:
|
57
|
+
(#{REGEXEN[:valid_preceding_chars]}) # $2 Preceding character
|
57
58
|
( # $3 URL
|
58
59
|
(https?:\/\/|www\.) # $4 Protocol or beginning
|
59
60
|
(#{REGEXEN[:valid_domain]}) # $5 Domain(s) and optional post number
|
data/lib/twitter-text.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless ['u','UTF8'].include?($KCODE)
|
3
2
|
|
4
3
|
require 'rubygems'
|
@@ -10,4 +9,4 @@ require File.join(File.dirname(__FILE__), 'regex')
|
|
10
9
|
require File.join(File.dirname(__FILE__), 'autolink')
|
11
10
|
require File.join(File.dirname(__FILE__), 'extractor')
|
12
11
|
require File.join(File.dirname(__FILE__), 'unicode')
|
13
|
-
require File.join(File.dirname(__FILE__), 'validation')
|
12
|
+
require File.join(File.dirname(__FILE__), 'validation')
|
data/lib/unicode.rb
CHANGED
data/lib/validation.rb
CHANGED
data/spec/extractor_spec.rb
CHANGED
@@ -59,21 +59,21 @@ describe Twitter::Extractor do
|
|
59
59
|
@extractor.extract_reply_screen_name("@alice reply text").should == "alice"
|
60
60
|
end
|
61
61
|
|
62
|
-
it "should extract
|
62
|
+
it "should extract preceded by a space" do
|
63
63
|
@extractor.extract_reply_screen_name(" @alice reply text").should == "alice"
|
64
64
|
end
|
65
65
|
|
66
|
-
it "should extract
|
66
|
+
it "should extract preceded by a full-width space" do
|
67
67
|
@extractor.extract_reply_screen_name("#{[0x3000].pack('U')}@alice reply text").should == "alice"
|
68
68
|
end
|
69
69
|
end
|
70
70
|
|
71
71
|
context "should not be extracted from" do
|
72
|
-
it "should not be extracted when
|
72
|
+
it "should not be extracted when preceded by text" do
|
73
73
|
@extractor.extract_reply_screen_name("reply @alice text").should == nil
|
74
74
|
end
|
75
75
|
|
76
|
-
it "should not be extracted when
|
76
|
+
it "should not be extracted when preceded by puctuation" do
|
77
77
|
%w(. / _ - + # ! @).each do |punct|
|
78
78
|
@extractor.extract_reply_screen_name("#{punct}@alice text").should == nil
|
79
79
|
end
|
@@ -99,39 +99,21 @@ describe Twitter::Extractor do
|
|
99
99
|
|
100
100
|
describe "urls" do
|
101
101
|
describe "matching URLS" do
|
102
|
-
|
103
|
-
"
|
104
|
-
|
105
|
-
"http://google.com/#foo",
|
106
|
-
"http://google.com/#search?q=iphone%20-filter%3Alinks",
|
107
|
-
"http://twitter.com/#search?q=iphone%20-filter%3Alinks",
|
108
|
-
"http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html",
|
109
|
-
"http://somehost.com:3000",
|
110
|
-
"http://x.com/~matthew+%-x",
|
111
|
-
"http://en.wikipedia.org/wiki/Primer_(film)",
|
112
|
-
"http://www.ams.org/bookstore-getitem/item=mbk-59",
|
113
|
-
"http://chilp.it/?77e8fd",
|
114
|
-
]
|
115
|
-
|
116
|
-
@urls.each do |url|
|
117
|
-
it "should extract the URL #{url}" do
|
118
|
-
@extractor.extract_urls(url).should == [url]
|
102
|
+
TestUrls::VALID.each do |url|
|
103
|
+
it "should extract the URL #{url} and prefix it with a protocol if missing" do
|
104
|
+
@extractor.extract_urls(url).first.should include(url)
|
119
105
|
end
|
120
106
|
|
121
107
|
it "should match the URL #{url} when it's embedded in other text" do
|
122
108
|
text = "Sweet url: #{url} I found. #awesome"
|
123
|
-
@extractor.extract_urls(text).should
|
109
|
+
@extractor.extract_urls(text).first.should include(url)
|
124
110
|
end
|
125
111
|
end
|
126
112
|
end
|
127
113
|
|
128
114
|
describe "invalid URLS" do
|
129
|
-
|
130
|
-
|
131
|
-
"http://no-tld",
|
132
|
-
"http://tld-too-short.x",
|
133
|
-
"http://doman-dash_2314352345_dfasd.foo-cow_4352.com",
|
134
|
-
].each {|url| @extractor.extract_urls(url).should == [] }
|
115
|
+
it "does not link urls with invalid domains" do
|
116
|
+
@extractor.extract_urls("http://tld-too-short.x").should == []
|
135
117
|
end
|
136
118
|
end
|
137
119
|
end
|
@@ -150,7 +132,6 @@ describe Twitter::Extractor do
|
|
150
132
|
end
|
151
133
|
|
152
134
|
context "international hashtags" do
|
153
|
-
|
154
135
|
context "should allow accents" do
|
155
136
|
%w(mañana café münchen).each do |hashtag|
|
156
137
|
it "should extract ##{hashtag}" do
|
data/spec/regex_spec.rb
CHANGED
@@ -2,27 +2,7 @@ require File.dirname(__FILE__) + '/spec_helper'
|
|
2
2
|
|
3
3
|
describe "Twitter::Regex regular expressions" do
|
4
4
|
describe "matching URLS" do
|
5
|
-
|
6
|
-
"http://google.com",
|
7
|
-
"http://foobar.com/#",
|
8
|
-
"http://google.com/#foo",
|
9
|
-
"http://google.com/#search?q=iphone%20-filter%3Alinks",
|
10
|
-
"http://twitter.com/#search?q=iphone%20-filter%3Alinks",
|
11
|
-
"http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html",
|
12
|
-
"http://somehost.com:3000",
|
13
|
-
"http://x.com/~matthew+%-x",
|
14
|
-
"http://en.wikipedia.org/wiki/Primer_(film)",
|
15
|
-
"http://www.ams.org/bookstore-getitem/item=mbk-59",
|
16
|
-
"http://chilp.it/?77e8fd",
|
17
|
-
"www.foobar.com",
|
18
|
-
"WWW.FOOBAR.COM",
|
19
|
-
"http://tell.me/why",
|
20
|
-
"http://longtlds.mobi",
|
21
|
-
"http://✪df.ws/ejp",
|
22
|
-
"http://日本.com"
|
23
|
-
]
|
24
|
-
|
25
|
-
@urls.each do |url|
|
5
|
+
TestUrls::VALID.each do |url|
|
26
6
|
it "should match the URL #{url}" do
|
27
7
|
url.should match_autolink_expression
|
28
8
|
end
|
@@ -36,19 +16,8 @@ describe "Twitter::Regex regular expressions" do
|
|
36
16
|
|
37
17
|
describe "invalid URLS" do
|
38
18
|
it "does not link urls with invalid characters" do
|
39
|
-
|
40
|
-
"http://no-tld",
|
41
|
-
"http://tld-too-short.x",
|
42
|
-
"http://x.com/,,,/.../@@@/;;;/:::/---/%%%x",
|
43
|
-
"http://doman_dash_2314352345_dfasd.foo-cow_4352.com",
|
44
|
-
].each {|url| url.should_not have_autolinked_url(url)}
|
45
|
-
end
|
46
|
-
|
47
|
-
it "does not link domains beginning with a hypen" do
|
48
|
-
pending
|
49
|
-
"http://-doman_dash_2314352345_dfasd.com".should_not match_autolink_expression
|
19
|
+
TestUrls::INVALID.each {|url| url.should_not have_autolinked_url(url)}
|
50
20
|
end
|
51
|
-
|
52
21
|
end
|
53
22
|
|
54
23
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -3,6 +3,11 @@ $:.push File.join(File.dirname(__FILE__), '..', 'lib')
|
|
3
3
|
|
4
4
|
require 'twitter-text'
|
5
5
|
require 'hpricot'
|
6
|
+
require 'spec/test_urls'
|
7
|
+
|
8
|
+
Spec::Runner.configure do |config|
|
9
|
+
config.include TestUrls
|
10
|
+
end
|
6
11
|
|
7
12
|
Spec::Matchers.define :match_autolink_expression do
|
8
13
|
match do |string|
|
@@ -81,6 +86,10 @@ Spec::Matchers.define :have_autolinked_hashtag do |hashtag|
|
|
81
86
|
end
|
82
87
|
|
83
88
|
failure_message_for_should do |text|
|
84
|
-
|
89
|
+
if @link
|
90
|
+
"Expected link text to be #{hashtag}, but it was #{@link.inner_text}"
|
91
|
+
else
|
92
|
+
"Expected hashtag #{hashtag} to be autolinked in '#{text}', but no link was found."
|
93
|
+
end
|
85
94
|
end
|
86
95
|
end
|
data/spec/test_urls.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
module TestUrls
|
2
|
+
VALID = [
|
3
|
+
"http://google.com",
|
4
|
+
"http://foobar.com/#",
|
5
|
+
"http://google.com/#foo",
|
6
|
+
"http://google.com/#search?q=iphone%20-filter%3Alinks",
|
7
|
+
"http://twitter.com/#search?q=iphone%20-filter%3Alinks",
|
8
|
+
"http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html",
|
9
|
+
"http://somehost.com:3000",
|
10
|
+
"http://x.com/~matthew+%-x",
|
11
|
+
"http://en.wikipedia.org/wiki/Primer_(film)",
|
12
|
+
"http://www.ams.org/bookstore-getitem/item=mbk-59",
|
13
|
+
"http://chilp.it/?77e8fd",
|
14
|
+
"www.foobar.com",
|
15
|
+
"WWW.FOOBAR.COM",
|
16
|
+
"http://tell.me/why",
|
17
|
+
"http://longtlds.info",
|
18
|
+
"http://✪df.ws/ejp",
|
19
|
+
"http://日本.com"
|
20
|
+
]
|
21
|
+
|
22
|
+
INVALID = [
|
23
|
+
"http://no-tld",
|
24
|
+
"http://tld-too-short.x",
|
25
|
+
"http://x.com/,,,/.../@@@/;;;/:::/---/%%%x",
|
26
|
+
"http://domain-dash.com",
|
27
|
+
"http://-doman_dash.com"
|
28
|
+
]
|
29
|
+
|
30
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitter-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Matt Sanford
|
@@ -9,7 +9,7 @@ autorequire: ""
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-
|
12
|
+
date: 2010-03-05 00:00:00 -08:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -45,6 +45,7 @@ files:
|
|
45
45
|
- spec/extractor_spec.rb
|
46
46
|
- spec/regex_spec.rb
|
47
47
|
- spec/spec_helper.rb
|
48
|
+
- spec/test_urls.rb
|
48
49
|
- spec/unicode_spec.rb
|
49
50
|
- spec/validation_spec.rb
|
50
51
|
has_rdoc: true
|