twitter-text 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +8 -0
- data/Rakefile +1 -2
- data/lib/extractor.rb +4 -4
- data/lib/regex.rb +7 -6
- data/lib/twitter-text.rb +1 -2
- data/lib/unicode.rb +1 -2
- data/lib/validation.rb +1 -2
- data/spec/extractor_spec.rb +10 -29
- data/spec/regex_spec.rb +2 -33
- data/spec/spec_helper.rb +10 -1
- data/spec/test_urls.rb +30 -0
- metadata +3 -2
data/README.rdoc
CHANGED
@@ -56,3 +56,11 @@ between words.
|
|
56
56
|
Special care has been taken to be sure that auto-linking and extraction work
|
57
57
|
in Tweets of all languages. This means that languages without spaces between
|
58
58
|
words should work equally well.
|
59
|
+
|
60
|
+
=== Conformance
|
61
|
+
|
62
|
+
To run the Conformance suite, you'll need to add that project as a git submodule. From the root twitter-text-rb directory, run:
|
63
|
+
|
64
|
+
git submodule add git@github.com:mzsanford/twitter-text-conformance.git test/twitter-text-conformance/
|
65
|
+
git submodule init
|
66
|
+
git submodule update
|
data/Rakefile
CHANGED
@@ -7,10 +7,9 @@ require 'spec/rake/spectask'
|
|
7
7
|
require 'spec/rake/verify_rcov'
|
8
8
|
require 'digest'
|
9
9
|
|
10
|
-
|
11
10
|
spec = Gem::Specification.new do |s|
|
12
11
|
s.name = "twitter-text"
|
13
|
-
s.version = "1.0.1"
|
12
|
+
s.version = "1.0.2"
|
14
13
|
s.author = "Matt Sanford"
|
15
14
|
s.email = "matt@twitter.com"
|
16
15
|
s.homepage = "http://twitter.com"
|
data/lib/extractor.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
module Twitter
|
3
2
|
# A module for including Tweet parsing in a class. This module provides functions for the extraction and processing
|
4
3
|
# of usernames, lists, URLs and hashtags.
|
@@ -13,7 +12,9 @@ module Twitter
|
|
13
12
|
return [] unless text
|
14
13
|
|
15
14
|
possible_screen_names = []
|
16
|
-
text.scan(Twitter::Regex[:extract_mentions])
|
15
|
+
text.scan(Twitter::Regex[:extract_mentions]) do |before, sn, after|
|
16
|
+
possible_screen_names << sn unless after =~ Twitter::Regex[:at_signs]
|
17
|
+
end
|
17
18
|
possible_screen_names.each{|sn| yield sn } if block_given?
|
18
19
|
possible_screen_names
|
19
20
|
end
|
@@ -39,7 +40,6 @@ module Twitter
|
|
39
40
|
# If a block is given then it will be called for each URL.
|
40
41
|
def extract_urls(text) # :yields: url
|
41
42
|
return [] unless text
|
42
|
-
|
43
43
|
urls = []
|
44
44
|
text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, path, query|
|
45
45
|
urls << (protocol == "www." ? "http://#{url}" : url)
|
@@ -66,4 +66,4 @@ module Twitter
|
|
66
66
|
end
|
67
67
|
|
68
68
|
end
|
69
|
-
end
|
69
|
+
end
|
data/lib/regex.rb
CHANGED
@@ -26,8 +26,9 @@ module Twitter
|
|
26
26
|
].flatten.freeze
|
27
27
|
REGEXEN[:spaces] = Regexp.new(UNICODE_SPACES.collect{ |e| [e].pack 'U*' }.join('|'))
|
28
28
|
|
29
|
-
REGEXEN[:
|
30
|
-
REGEXEN[:
|
29
|
+
REGEXEN[:at_signs] = /[@＠]/
|
30
|
+
REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(?=(.|$))/o
|
31
|
+
REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
|
31
32
|
|
32
33
|
REGEXEN[:list_name] = /^[a-zA-Z\x80-\xff].{0,79}$/
|
33
34
|
|
@@ -42,9 +43,9 @@ module Twitter
|
|
42
43
|
REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\<\|:~\(|\}:o\{|:\-\[|\>o\<|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
|
43
44
|
|
44
45
|
# URL related hash regex collection
|
45
|
-
REGEXEN[:
|
46
|
-
REGEXEN[:valid_domain] = /(?:[\.-]|[^[:punct:]])+\.[a-z]{2,}(?::[0-9]+)?/i
|
47
|
-
REGEXEN[:valid_url_path_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_
|
46
|
+
REGEXEN[:valid_preceding_chars] = /(?:[^\/"':!=]|^|\:)/
|
47
|
+
REGEXEN[:valid_domain] = /(?:[\.-]|[^[:punct:]\s])+\.[a-z]{2,}(?::[0-9]+)?/i
|
48
|
+
REGEXEN[:valid_url_path_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~@]/i
|
48
49
|
# Valid end-of-path characters (so /foo. does not gobble the period).
|
49
50
|
# 1. Allow ) for Wikipedia URLs.
|
50
51
|
# 2. Allow =&# for empty URL parameters and other URL-join artifacts
|
@@ -53,7 +54,7 @@ module Twitter
|
|
53
54
|
REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#]/i
|
54
55
|
REGEXEN[:valid_url] = %r{
|
55
56
|
( # $1 total match
|
56
|
-
(#{REGEXEN[:
|
57
|
+
(#{REGEXEN[:valid_preceding_chars]}) # $2 Preceding character
|
57
58
|
( # $3 URL
|
58
59
|
(https?:\/\/|www\.) # $4 Protocol or beginning
|
59
60
|
(#{REGEXEN[:valid_domain]}) # $5 Domain(s) and optional post number
|
data/lib/twitter-text.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless ['u','UTF8'].include?($KCODE)
|
3
2
|
|
4
3
|
require 'rubygems'
|
@@ -10,4 +9,4 @@ require File.join(File.dirname(__FILE__), 'regex')
|
|
10
9
|
require File.join(File.dirname(__FILE__), 'autolink')
|
11
10
|
require File.join(File.dirname(__FILE__), 'extractor')
|
12
11
|
require File.join(File.dirname(__FILE__), 'unicode')
|
13
|
-
require File.join(File.dirname(__FILE__), 'validation')
|
12
|
+
require File.join(File.dirname(__FILE__), 'validation')
|
data/lib/unicode.rb
CHANGED
data/lib/validation.rb
CHANGED
data/spec/extractor_spec.rb
CHANGED
@@ -59,21 +59,21 @@ describe Twitter::Extractor do
|
|
59
59
|
@extractor.extract_reply_screen_name("@alice reply text").should == "alice"
|
60
60
|
end
|
61
61
|
|
62
|
-
it "should extract
|
62
|
+
it "should extract preceded by a space" do
|
63
63
|
@extractor.extract_reply_screen_name(" @alice reply text").should == "alice"
|
64
64
|
end
|
65
65
|
|
66
|
-
it "should extract
|
66
|
+
it "should extract preceded by a full-width space" do
|
67
67
|
@extractor.extract_reply_screen_name("#{[0x3000].pack('U')}@alice reply text").should == "alice"
|
68
68
|
end
|
69
69
|
end
|
70
70
|
|
71
71
|
context "should not be extracted from" do
|
72
|
-
it "should not be extracted when
|
72
|
+
it "should not be extracted when preceded by text" do
|
73
73
|
@extractor.extract_reply_screen_name("reply @alice text").should == nil
|
74
74
|
end
|
75
75
|
|
76
|
-
it "should not be extracted when
|
76
|
+
it "should not be extracted when preceded by puctuation" do
|
77
77
|
%w(. / _ - + # ! @).each do |punct|
|
78
78
|
@extractor.extract_reply_screen_name("#{punct}@alice text").should == nil
|
79
79
|
end
|
@@ -99,39 +99,21 @@ describe Twitter::Extractor do
|
|
99
99
|
|
100
100
|
describe "urls" do
|
101
101
|
describe "matching URLS" do
|
102
|
-
|
103
|
-
"
|
104
|
-
|
105
|
-
"http://google.com/#foo",
|
106
|
-
"http://google.com/#search?q=iphone%20-filter%3Alinks",
|
107
|
-
"http://twitter.com/#search?q=iphone%20-filter%3Alinks",
|
108
|
-
"http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html",
|
109
|
-
"http://somehost.com:3000",
|
110
|
-
"http://x.com/~matthew+%-x",
|
111
|
-
"http://en.wikipedia.org/wiki/Primer_(film)",
|
112
|
-
"http://www.ams.org/bookstore-getitem/item=mbk-59",
|
113
|
-
"http://chilp.it/?77e8fd",
|
114
|
-
]
|
115
|
-
|
116
|
-
@urls.each do |url|
|
117
|
-
it "should extract the URL #{url}" do
|
118
|
-
@extractor.extract_urls(url).should == [url]
|
102
|
+
TestUrls::VALID.each do |url|
|
103
|
+
it "should extract the URL #{url} and prefix it with a protocol if missing" do
|
104
|
+
@extractor.extract_urls(url).first.should include(url)
|
119
105
|
end
|
120
106
|
|
121
107
|
it "should match the URL #{url} when it's embedded in other text" do
|
122
108
|
text = "Sweet url: #{url} I found. #awesome"
|
123
|
-
@extractor.extract_urls(text).should
|
109
|
+
@extractor.extract_urls(text).first.should include(url)
|
124
110
|
end
|
125
111
|
end
|
126
112
|
end
|
127
113
|
|
128
114
|
describe "invalid URLS" do
|
129
|
-
|
130
|
-
|
131
|
-
"http://no-tld",
|
132
|
-
"http://tld-too-short.x",
|
133
|
-
"http://doman-dash_2314352345_dfasd.foo-cow_4352.com",
|
134
|
-
].each {|url| @extractor.extract_urls(url).should == [] }
|
115
|
+
it "does not link urls with invalid domains" do
|
116
|
+
@extractor.extract_urls("http://tld-too-short.x").should == []
|
135
117
|
end
|
136
118
|
end
|
137
119
|
end
|
@@ -150,7 +132,6 @@ describe Twitter::Extractor do
|
|
150
132
|
end
|
151
133
|
|
152
134
|
context "international hashtags" do
|
153
|
-
|
154
135
|
context "should allow accents" do
|
155
136
|
%w(mañana café münchen).each do |hashtag|
|
156
137
|
it "should extract ##{hashtag}" do
|
data/spec/regex_spec.rb
CHANGED
@@ -2,27 +2,7 @@ require File.dirname(__FILE__) + '/spec_helper'
|
|
2
2
|
|
3
3
|
describe "Twitter::Regex regular expressions" do
|
4
4
|
describe "matching URLS" do
|
5
|
-
|
6
|
-
"http://google.com",
|
7
|
-
"http://foobar.com/#",
|
8
|
-
"http://google.com/#foo",
|
9
|
-
"http://google.com/#search?q=iphone%20-filter%3Alinks",
|
10
|
-
"http://twitter.com/#search?q=iphone%20-filter%3Alinks",
|
11
|
-
"http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html",
|
12
|
-
"http://somehost.com:3000",
|
13
|
-
"http://x.com/~matthew+%-x",
|
14
|
-
"http://en.wikipedia.org/wiki/Primer_(film)",
|
15
|
-
"http://www.ams.org/bookstore-getitem/item=mbk-59",
|
16
|
-
"http://chilp.it/?77e8fd",
|
17
|
-
"www.foobar.com",
|
18
|
-
"WWW.FOOBAR.COM",
|
19
|
-
"http://tell.me/why",
|
20
|
-
"http://longtlds.mobi",
|
21
|
-
"http://✪df.ws/ejp",
|
22
|
-
"http://日本.com"
|
23
|
-
]
|
24
|
-
|
25
|
-
@urls.each do |url|
|
5
|
+
TestUrls::VALID.each do |url|
|
26
6
|
it "should match the URL #{url}" do
|
27
7
|
url.should match_autolink_expression
|
28
8
|
end
|
@@ -36,19 +16,8 @@ describe "Twitter::Regex regular expressions" do
|
|
36
16
|
|
37
17
|
describe "invalid URLS" do
|
38
18
|
it "does not link urls with invalid characters" do
|
39
|
-
|
40
|
-
"http://no-tld",
|
41
|
-
"http://tld-too-short.x",
|
42
|
-
"http://x.com/,,,/.../@@@/;;;/:::/---/%%%x",
|
43
|
-
"http://doman_dash_2314352345_dfasd.foo-cow_4352.com",
|
44
|
-
].each {|url| url.should_not have_autolinked_url(url)}
|
45
|
-
end
|
46
|
-
|
47
|
-
it "does not link domains beginning with a hypen" do
|
48
|
-
pending
|
49
|
-
"http://-doman_dash_2314352345_dfasd.com".should_not match_autolink_expression
|
19
|
+
TestUrls::INVALID.each {|url| url.should_not have_autolinked_url(url)}
|
50
20
|
end
|
51
|
-
|
52
21
|
end
|
53
22
|
|
54
23
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -3,6 +3,11 @@ $:.push File.join(File.dirname(__FILE__), '..', 'lib')
|
|
3
3
|
|
4
4
|
require 'twitter-text'
|
5
5
|
require 'hpricot'
|
6
|
+
require 'spec/test_urls'
|
7
|
+
|
8
|
+
Spec::Runner.configure do |config|
|
9
|
+
config.include TestUrls
|
10
|
+
end
|
6
11
|
|
7
12
|
Spec::Matchers.define :match_autolink_expression do
|
8
13
|
match do |string|
|
@@ -81,6 +86,10 @@ Spec::Matchers.define :have_autolinked_hashtag do |hashtag|
|
|
81
86
|
end
|
82
87
|
|
83
88
|
failure_message_for_should do |text|
|
84
|
-
|
89
|
+
if @link
|
90
|
+
"Expected link text to be #{hashtag}, but it was #{@link.inner_text}"
|
91
|
+
else
|
92
|
+
"Expected hashtag #{hashtag} to be autolinked in '#{text}', but no link was found."
|
93
|
+
end
|
85
94
|
end
|
86
95
|
end
|
data/spec/test_urls.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
module TestUrls
|
2
|
+
VALID = [
|
3
|
+
"http://google.com",
|
4
|
+
"http://foobar.com/#",
|
5
|
+
"http://google.com/#foo",
|
6
|
+
"http://google.com/#search?q=iphone%20-filter%3Alinks",
|
7
|
+
"http://twitter.com/#search?q=iphone%20-filter%3Alinks",
|
8
|
+
"http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html",
|
9
|
+
"http://somehost.com:3000",
|
10
|
+
"http://x.com/~matthew+%-x",
|
11
|
+
"http://en.wikipedia.org/wiki/Primer_(film)",
|
12
|
+
"http://www.ams.org/bookstore-getitem/item=mbk-59",
|
13
|
+
"http://chilp.it/?77e8fd",
|
14
|
+
"www.foobar.com",
|
15
|
+
"WWW.FOOBAR.COM",
|
16
|
+
"http://tell.me/why",
|
17
|
+
"http://longtlds.info",
|
18
|
+
"http://✪df.ws/ejp",
|
19
|
+
"http://日本.com"
|
20
|
+
]
|
21
|
+
|
22
|
+
INVALID = [
|
23
|
+
"http://no-tld",
|
24
|
+
"http://tld-too-short.x",
|
25
|
+
"http://x.com/,,,/.../@@@/;;;/:::/---/%%%x",
|
26
|
+
"http://domain-dash.com",
|
27
|
+
"http://-doman_dash.com"
|
28
|
+
]
|
29
|
+
|
30
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitter-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Matt Sanford
|
@@ -9,7 +9,7 @@ autorequire: ""
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-
|
12
|
+
date: 2010-03-05 00:00:00 -08:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -45,6 +45,7 @@ files:
|
|
45
45
|
- spec/extractor_spec.rb
|
46
46
|
- spec/regex_spec.rb
|
47
47
|
- spec/spec_helper.rb
|
48
|
+
- spec/test_urls.rb
|
48
49
|
- spec/unicode_spec.rb
|
49
50
|
- spec/validation_spec.rb
|
50
51
|
has_rdoc: true
|