twitter-text 1.1.2 → 1.1.4
- data/README.rdoc +16 -1
- data/Rakefile +5 -8
- data/lib/extractor.rb +106 -8
- data/lib/hithighlighter.rb +3 -1
- data/lib/regex.rb +17 -7
- data/lib/twitter-text.rb +1 -1
- data/spec/autolinking_spec.rb +45 -4
- data/spec/extractor_spec.rb +149 -1
- data/spec/hithighlighter_spec.rb +1 -0
- data/spec/regex_spec.rb +1 -0
- data/spec/test_urls.rb +3 -0
- data/spec/unicode_spec.rb +1 -0
- data/spec/validation_spec.rb +1 -0
- metadata +20 -9
data/README.rdoc
CHANGED
@@ -26,7 +26,7 @@ usernames, lists, hashtags and URLs.
   # Auto-link
   class MyClass
     include Twitter::Autolink
-
+
     html = auto_link("link @user, please #request")
   end

@@ -69,3 +69,18 @@ To run the Conformance suite, you'll need to add that project as a git submodule
   git submodule add git@github.com:mzsanford/twitter-text-conformance.git test/twitter-text-conformance/
   git submodule init
   git submodule update
+
+=== Thanks
+
+Thanks to everybody who has filed issues, provided feedback or contributed patches. Patches courtesy of:
+
+* At Twitter …
+* Raffi Krikorian - http://github.com/r
+* Ben Cherry - http://github.com/bcherry
+* Patrick Ewing - http://github.com/hoverbird
+* Jeff Smick - http://github.com/sprsquish
+
+* Patches from the community …
+* Jean-Philippe Bougie - http://github.com/jpbougie
+
+* Anyone who has filed an issue. It helps. Really.
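For reference, the auto-linking entry point documented above can be exercised like this. This is a minimal sketch only: the markup auto_link emits is illustrative, and on Ruby 1.8 $KCODE must be set to 'u' before the require (see lib/twitter-text.rb below).

  require 'twitter-text'

  class MyClass
    include Twitter::Autolink
  end

  # Wraps @usernames, #hashtags and URLs in the input text with HTML links.
  html = MyClass.new.auto_link("link @user, please #request")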
data/Rakefile
CHANGED
@@ -9,9 +9,9 @@ require 'digest'

 spec = Gem::Specification.new do |s|
   s.name = "twitter-text"
-  s.version = "1.1.2"
-  s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle"]
-  s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com"]
+  s.version = "1.1.4"
+  s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle", "Raffi Krikorian"]
+  s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com", "raffi@twitter.com"]
   s.homepage = "http://twitter.com"
   s.description = s.summary = "A gem that provides text handling for Twitter"

@@ -32,6 +32,7 @@ desc "Run specs"
 Spec::Rake::SpecTask.new do |t|
   t.spec_files = FileList['spec/**/*_spec.rb']
   t.spec_opts = %w(-fs --color)
+  t.libs << ["spec", '.']
 end

 desc "Run all examples with RCov"
@@ -41,15 +42,12 @@ Spec::Rake::SpecTask.new('spec:rcov') do |t|
   t.rcov_opts = ['--exclude', 'spec']
 end

-
 def conformance_version(dir)
   Dir[File.join(dir, '*')].inject(Digest::SHA1.new){|digest, file| digest.update(Digest::SHA1.file(file).hexdigest) }
 end

 namespace :test do
   namespace :conformance do
-
-
     desc "Update conformance testing data"
     task :update do
       puts "Updating conformance data ... "
@@ -85,8 +83,7 @@ namespace :test do
     end
   end

-
-namespace :doc do
+namespace :doc do
   Rake::RDocTask.new do |rd|
     rd.main = "README.rdoc"
     rd.rdoc_dir = 'doc'
data/lib/extractor.rb
CHANGED
@@ -1,3 +1,41 @@
+class String
+  # Helper function to count the character length by first converting to an
+  # array. This is needed because with unicode strings, the return value
+  # of length may be incorrect
+  def char_length
+    if respond_to? :codepoints
+      length
+    else
+      chars.kind_of?(Enumerable) ? chars.to_a.size : chars.size
+    end
+  end
+
+  # Helper function to convert this string into an array of unicode characters.
+  def to_char_a
+    @to_char_a ||= if chars.kind_of?(Enumerable)
+      chars.to_a
+    else
+      char_array = []
+      0.upto(char_length - 1) { |i| char_array << [chars.slice(i)].pack('U') }
+      char_array
+    end
+  end
+
+  # Helper function to find the index of the <tt>sub_string</tt> in
+  # <tt>str</tt>. This is needed because with unicode strings, the return
+  # of index may be incorrect.
+  def sub_string_search(sub_str, position = 0)
+    if respond_to? :codepoints
+      index(sub_str, position)
+    else
+      index = to_char_a[position..-1].each_with_index.find do |e|
+        to_char_a.slice(e.last + position, sub_str.char_length).map{|ci| ci.first }.join == sub_str
+      end
+      index.nil? ? -1 : index.last + position
+    end
+  end
+end
+
 module Twitter
   # A module for including Tweet parsing in a class. This module provides function for the extraction and processing
   # of usernames, lists, URLs and hashtags.

@@ -9,13 +47,38 @@ module Twitter
     #
     # If a block is given then it will be called for each username.
     def extract_mentioned_screen_names(text) # :yields: username
+      screen_names_only = extract_mentioned_screen_names_with_indices(text).map{|mention| mention[:screen_name] }
+      screen_names_only.each{|mention| yield mention } if block_given?
+      screen_names_only
+    end
+
+    # Extracts a list of all usersnames mentioned in the Tweet <tt>text</tt>
+    # along with the indices for where the mention ocurred. If the
+    # <tt>text</tt> is nil or contains no username mentions, an empty array
+    # will be returned.
+    #
+    # If a block is given, then it will be called with each username, the start
+    # index, and the end index in the <tt>text</tt>.
+    def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
       return [] unless text

       possible_screen_names = []
-
-
+      position = 0
+      text.to_s.scan(Twitter::Regex[:extract_mentions]) do |before, sn, after|
+        unless after =~ Twitter::Regex[:at_signs]
+          start_position = text.to_s.sub_string_search(sn, position) - 1
+          position = start_position + sn.char_length + 1
+          possible_screen_names << {
+            :screen_name => sn,
+            :indices => [start_position, position]
+          }
+        end
+      end
+      if block_given?
+        possible_screen_names.each do |mention|
+          yield mention[:screen_name], mention[:indices].first, mention[:indices].last
+        end
       end
-      possible_screen_names.each{|sn| yield sn } if block_given?
       possible_screen_names
     end

@@ -39,12 +102,30 @@ module Twitter
     #
     # If a block is given then it will be called for each URL.
     def extract_urls(text) # :yields: url
+      urls_only = extract_urls_with_indices(text).map{|url| url[:url] }
+      urls_only.each{|url| yield url } if block_given?
+      urls_only
+    end
+
+    # Extracts a list of all URLs included in the Tweet <tt>text</tt> along
+    # with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
+    # URLs an empty array will be returned.
+    #
+    # If a block is given then it will be called for each URL.
+    def extract_urls_with_indices(text) # :yields: url, start, end
       return [] unless text
       urls = []
+      position = 0
       text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, path, query|
-
+        start_position = text.to_s.sub_string_search(url, position)
+        end_position = start_position + url.char_length
+        position = end_position
+        urls << {
+          :url => (protocol == "www." ? "http://#{url}" : url),
+          :indices => [start_position, end_position]
+        }
       end
-      urls.each{|url| yield url } if block_given?
+      urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last } if block_given?
       urls
     end

@@ -55,15 +136,32 @@ module Twitter
     #
     # If a block is given then it will be called for each hashtag.
     def extract_hashtags(text) # :yields: hashtag_text
+      hashtags_only = extract_hashtags_with_indices(text).map{|hash| hash[:hashtag] }
+      hashtags_only.each{|hash| yield hash } if block_given?
+      hashtags_only
+    end
+
+    # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
+    # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
+    # will be returned. The array returned will not include the leading <tt>#</tt>
+    # character.
+    #
+    # If a block is given then it will be called for each hashtag.
+    def extract_hashtags_with_indices(text) # :yields: hashtag_text, start, end
       return [] unless text

       tags = []
+      position = 0
       text.scan(Twitter::Regex[:auto_link_hashtags]) do |before, hash, hash_text|
-
+        start_position = text.to_s.sub_string_search(hash, position)
+        position = start_position + hash_text.char_length + 1
+        tags << {
+          :hashtag => hash_text,
+          :indices => [start_position, position]
+        }
       end
-      tags.each{|tag| yield tag } if block_given?
+      tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given?
       tags
     end
-
   end
 end
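The net effect of the extractor changes: each existing extractor gains a *_with_indices twin that returns hashes carrying the character offsets of every match, and the old methods are reimplemented on top of them. A short usage sketch; the return values mirror the examples in spec/extractor_spec.rb further down, and it assumes Ruby 1.9 (or $KCODE set to 'u' on 1.8):

  require 'twitter-text'

  class MyExtractor
    include Twitter::Extractor
  end

  extractor = MyExtractor.new

  extractor.extract_mentioned_screen_names_with_indices("@alice @bob")
  # => [{:screen_name => "alice", :indices => [0, 6]},
  #     {:screen_name => "bob",   :indices => [7, 11]}]

  extractor.extract_urls_with_indices("Sweet url: http://t.co/nwcLTFF I found.")
  # => [{:url => "http://t.co/nwcLTFF", :indices => [11, 30]}]

  extractor.extract_hashtags_with_indices("pre-text #café post-text")
  # => [{:hashtag => "café", :indices => [9, 14]}]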
data/lib/hithighlighter.rb
CHANGED
@@ -65,7 +65,7 @@ module Twitter
         end
         placed = true
       end
-
+
       # ultimate fallback, hits that run off the end get a closing tag
       if !placed
         result << tag
@@ -82,6 +82,8 @@ module Twitter
       end

       result
+    rescue
+      text
     end
   end
 end
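The new rescue makes hit highlighting fail soft: if rebuilding the markup raises for any reason, the caller gets the original text back instead of an exception. A minimal sketch, assuming the module is Twitter::HitHighlighter with a hit_highlight(text, hits) method and an <em> default tag — neither of which is shown in this hunk:

  class MyHighlighter
    include Twitter::HitHighlighter
  end

  highlighter = MyHighlighter.new

  # Hits are [start, end] character offsets; each range gets wrapped in the tag.
  highlighter.hit_highlight("Hello world", [[0, 5]])  # => "<em>Hello</em> world"

  # With the rescue in place, hit data that used to raise now simply returns
  # the input text unchanged.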
data/lib/regex.rb
CHANGED
@@ -1,3 +1,4 @@
+# coding: UTF-8

 module Twitter
   # A collection of regular expressions for parsing Tweet text. The regular expression
@@ -30,7 +31,7 @@ module Twitter
    REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(?=(.|$))/o
    REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o

-    REGEXEN[:list_name] = /^[a-zA-Z\
+    REGEXEN[:list_name] = /^[a-zA-Z\u0080-\u00ff].{0,79}$/

    # Latin accented characters (subtracted 0xD7 from the range, it's a confusable multiplication sign. Looks like "x")
    LATIN_ACCENTS = [(0xc0..0xd6).to_a, (0xd8..0xf6).to_a, (0xf8..0xff).to_a].flatten.pack('U*').freeze
@@ -39,17 +40,24 @@ module Twitter
    # Characters considered valid in a hashtag but not at the beginning, where only a-z and 0-9 are valid.
    HASHTAG_CHARACTERS = /[a-z0-9_#{LATIN_ACCENTS}]/io
    REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/]+)(#|#)([0-9A-Z_]*[A-Z_]+#{HASHTAG_CHARACTERS}*)/io
-    REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9\
+    REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9\u0080-\u00ff\-]{0,79})?/
    REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\<\|:~\(|\}:o\{|:\-\[|\>o\<|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/

    # URL related hash regex collection
    REGEXEN[:valid_preceding_chars] = /(?:[^\/"':!=]|^|\:)/
    REGEXEN[:valid_domain] = /(?:[^[:punct:]\s][\.-](?=[^[:punct:]\s])|[^[:punct:]\s]){1,}\.[a-z]{2,}(?::[0-9]+)?/i
-
+
+    # Allow URL paths to contain balanced parens
+    #  1. Used in Wikipedia URLs like /Primer_(film)
+    #  2. Used in IIS sessions like /S(dfd346)/
+    REGEXEN[:wikipedia_disambiguation] = /(?:\([^\)]+\))/i
+    REGEXEN[:valid_url_path_chars] = /(?:
+      #{REGEXEN[:wikipedia_disambiguation]}|
+      [\.\,]?[a-z0-9!\*';:=\+\$\/%#\[\]\-_,~@]
+    )/ix
    # Valid end-of-path chracters (so /foo. does not gobble the period).
-    # 1. Allow
-
-    REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9\)=#\/]/i
+    # 1. Allow =&# for empty URL parameters and other URL-join artifacts
+    REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=#\/]/i
    REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~]/i
    REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#]/i
    REGEXEN[:valid_url] = %r{
@@ -58,7 +66,9 @@ module Twitter
        (                            # $3 URL
          (https?:\/\/|www\.)        # $4 Protocol or beginning
          (#{REGEXEN[:valid_domain]}) # $5 Domain(s) and optional post number
-          (/#{REGEXEN[:valid_url_path_chars]}
+          (/#{REGEXEN[:valid_url_path_chars]}*
+            #{REGEXEN[:valid_url_path_ending_chars]}?
+          )?                         # $6 URL Path
          (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $7 Query String
        )
      )
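The practical effect of the new path rules: URLs whose paths contain balanced parentheses (Wikipedia disambiguation pages, IIS session segments) are now captured in full, while a trailing unbalanced paren is still excluded. A sketch via the extractor, with results matching the autolinking specs below:

  require 'twitter-text'

  extractor = Class.new { include Twitter::Extractor }.new

  extractor.extract_urls("http://en.wikipedia.org/wiki/Madonna_(artist)")
  # => ["http://en.wikipedia.org/wiki/Madonna_(artist)"]

  extractor.extract_urls("http://msdn.com/S(deadbeef)/page.htm")
  # => ["http://msdn.com/S(deadbeef)/page.htm"]

  extractor.extract_urls("http://example.com/i_has_a_(")
  # => ["http://example.com/i_has_a_"]  # the unbalanced paren is dropped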
data/lib/twitter-text.rb
CHANGED
@@ -1,4 +1,4 @@
-raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless ['u','UTF8'].include?($KCODE)
+raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless ['u','UTF8'].include?($KCODE) || ''.respond_to?(:codepoints)

 require 'rubygems'

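The relaxed guard means the $KCODE requirement now only applies to Ruby 1.8; on 1.9, String#codepoints exists and the check passes with no global setup. A sketch of what a caller needs:

  # Ruby 1.8 only: enable UTF-8 handling before the library loads.
  $KCODE = 'u' unless ''.respond_to?(:codepoints)

  require 'rubygems'
  require 'twitter-text'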
data/spec/autolinking_spec.rb
CHANGED
@@ -1,4 +1,6 @@
-
+#encoding: UTF-8
+# require File.dirname(__FILE__) + '/spec_helper'
+require 'spec_helper'

 class TestAutolink
   include Twitter::Autolink
@@ -303,7 +305,7 @@ describe Twitter::Autolink do

     it "should be linked" do
       link = Hpricot(@autolinked_text).at('a')
-      link.inner_text.should == "#{[0xFF03].pack('U')}twj_dev"
+      (link.inner_text.respond_to?(:force_encoding) ? link.inner_text.force_encoding("utf-8") : link.inner_text).should == "#{[0xFF03].pack('U')}twj_dev"
       link['href'].should == 'http://twitter.com/search?q=%23twj_dev'
     end
   end
@@ -329,7 +331,7 @@ describe Twitter::Autolink do
      end
    end

-    context "
+    context "with a path surrounded by parentheses;" do
      def original_text; "I found a neatness (#{url})"; end

      it "should be linked" do
@@ -340,10 +342,49 @@ describe Twitter::Autolink do
      def url; "http://www.google.com/"; end

      it "should be linked" do
-        pending # our support for Wikipedia URLS containing parens breaks this corner case
        @autolinked_text.should have_autolinked_url(url)
      end
    end
+
+      context "when the URL has a path;" do
+        def url; "http://www.google.com/fsdfasdf"; end
+
+        it "should be linked" do
+          @autolinked_text.should have_autolinked_url(url)
+        end
+      end
+    end
+
+    context "when path contains parens" do
+      def original_text; "I found a neatness (#{url})"; end
+
+      it "should be linked" do
+        @autolinked_text.should have_autolinked_url(url)
+      end
+
+      context "wikipedia" do
+        def url; "http://en.wikipedia.org/wiki/Madonna_(artist)"; end
+
+        it "should be linked" do
+          @autolinked_text.should have_autolinked_url(url)
+        end
+      end
+
+      context "IIS session" do
+        def url; "http://msdn.com/S(deadbeef)/page.htm"; end
+
+        it "should be linked" do
+          @autolinked_text.should have_autolinked_url(url)
+        end
+      end
+
+      context "unbalanced parens" do
+        def url; "http://example.com/i_has_a_("; end
+
+        it "should be linked" do
+          @autolinked_text.should have_autolinked_url("http://example.com/i_has_a_")
+        end
+      end
    end

    context "when preceded by a :" do
data/spec/extractor_spec.rb
CHANGED
@@ -1,3 +1,4 @@
+#encoding: UTF-8
 require File.dirname(__FILE__) + '/spec_helper'

 class TestExtractor
@@ -49,6 +50,58 @@ describe Twitter::Extractor do
     end
   end

+  describe "mentions with indices" do
+    context "single screen name alone " do
+      it "should be linked and the correct indices" do
+        @extractor.extract_mentioned_screen_names_with_indices("@alice").should == [{:screen_name => "alice", :indices => [0, 6]}]
+      end
+
+      it "should be linked with _ and the correct indices" do
+        @extractor.extract_mentioned_screen_names_with_indices("@alice_adams").should == [{:screen_name => "alice_adams", :indices => [0, 12]}]
+      end
+
+      it "should be linked if numeric and the correct indices" do
+        @extractor.extract_mentioned_screen_names_with_indices("@1234").should == [{:screen_name => "1234", :indices => [0, 5]}]
+      end
+    end
+
+    context "multiple screen names" do
+      it "should both be linked with the correct indices" do
+        @extractor.extract_mentioned_screen_names_with_indices("@alice @bob").should ==
+          [{:screen_name => "alice", :indices => [0, 6]},
+           {:screen_name => "bob", :indices => [7, 11]}]
+      end
+
+      it "should be linked with the correct indices even when repeated" do
+        @extractor.extract_mentioned_screen_names_with_indices("@alice @alice @bob").should ==
+          [{:screen_name => "alice", :indices => [0, 6]},
+           {:screen_name => "alice", :indices => [7, 13]},
+           {:screen_name => "bob", :indices => [14, 18]}]
+      end
+    end
+
+    context "screen names embedded in text" do
+      it "should be linked in Latin text with the correct indices" do
+        @extractor.extract_mentioned_screen_names_with_indices("waiting for @alice to arrive").should == [{:screen_name => "alice", :indices => [12, 18]}]
+      end
+
+      it "should be linked in Japanese text with the correct indices" do
+        @extractor.extract_mentioned_screen_names_with_indices("の@aliceに到着を待っている").should == [{:screen_name => "alice", :indices => [1, 7]}]
+      end
+    end
+
+    it "should accept a block arugment and call it in order" do
+      needed = [{:screen_name => "alice", :indices => [0, 6]}, {:screen_name => "bob", :indices => [7, 11]}]
+      @extractor.extract_mentioned_screen_names_with_indices("@alice @bob") do |sn, start_index, end_index|
+        data = needed.shift
+        sn.should == data[:screen_name]
+        start_index.should == data[:indices].first
+        end_index.should == data[:indices].last
+      end
+      needed.should == []
+    end
+  end
+
   describe "replies" do
     context "should be extracted from" do
       it "should extract from lone name" do
@@ -118,6 +171,37 @@ describe Twitter::Extractor do
     end
   end

+  describe "urls with indices" do
+    describe "matching URLS" do
+      TestUrls::VALID.each do |url|
+        it "should extract the URL #{url} and prefix it with a protocol if missing" do
+          extracted_urls = @extractor.extract_urls_with_indices(url)
+          extracted_urls.size.should == 1
+          extracted_url = extracted_urls.first
+          extracted_url[:url].should include(url)
+          extracted_url[:indices].first.should == 0
+          extracted_url[:indices].last.should == url.chars.to_a.size
+        end
+
+        it "should match the URL #{url} when it's embedded in other text" do
+          text = "Sweet url: #{url} I found. #awesome"
+          extracted_urls = @extractor.extract_urls_with_indices(text)
+          extracted_urls.size.should == 1
+          extracted_url = extracted_urls.first
+          extracted_url[:url].should include(url)
+          extracted_url[:indices].first.should == 11
+          extracted_url[:indices].last.should == 11 + url.chars.to_a.size
+        end
+      end
+    end
+
+    describe "invalid URLS" do
+      it "does not link urls with invalid domains" do
+        @extractor.extract_urls_with_indices("http://tld-too-short.x").should == []
+      end
+    end
+  end
+
   describe "hashtags" do
     context "extracts latin/numeric hashtags" do
       %w(text text123 123text).each do |hashtag|
@@ -163,7 +247,6 @@ describe Twitter::Extractor do
         end
       end
     end
-
   end

   it "should not extract numeric hashtags" do
@@ -171,4 +254,69 @@ describe Twitter::Extractor do
     end
   end

+  describe "hashtags with indices" do
+    def match_hashtag_in_text(hashtag, text, offset = 0)
+      extracted_hashtags = @extractor.extract_hashtags_with_indices(text)
+      extracted_hashtags.size.should == 1
+      extracted_hashtag = extracted_hashtags.first
+      extracted_hashtag[:hashtag].should == hashtag
+      extracted_hashtag[:indices].first.should == offset
+      extracted_hashtag[:indices].last.should == offset + hashtag.chars.to_a.size + 1
+    end
+
+    def no_match_hashtag_in_text(text)
+      extracted_hashtags = @extractor.extract_hashtags_with_indices(text)
+      extracted_hashtags.size.should == 0
+    end
+
+    context "extracts latin/numeric hashtags" do
+      %w(text text123 123text).each do |hashtag|
+        it "should extract ##{hashtag}" do
+          match_hashtag_in_text(hashtag, "##{hashtag}")
+        end
+
+        it "should extract ##{hashtag} within text" do
+          match_hashtag_in_text(hashtag, "pre-text ##{hashtag} post-text", 9)
+        end
+      end
+    end
+
+    context "international hashtags" do
+      context "should allow accents" do
+        %w(mañana café münchen).each do |hashtag|
+          it "should extract ##{hashtag}" do
+            match_hashtag_in_text(hashtag, "##{hashtag}")
+          end
+
+          it "should extract ##{hashtag} within text" do
+            match_hashtag_in_text(hashtag, "pre-text ##{hashtag} post-text", 9)
+          end
+        end
+
+        it "should not allow the multiplication character" do
+          match_hashtag_in_text('pre', "#pre#{[0xd7].pack('U')}post")
+        end
+
+        it "should not allow the division character" do
+          match_hashtag_in_text('pre', "#pre#{[0xf7].pack('U')}post")
+        end
+      end
+
+      context "should NOT allow Japanese" do
+        %w(会議中 ハッシュ).each do |hashtag|
+          it "should NOT extract ##{hashtag}" do
+            no_match_hashtag_in_text("##{hashtag}")
+          end
+
+          it "should NOT extract ##{hashtag} within text" do
+            no_match_hashtag_in_text("pre-text ##{hashtag} post-text")
+          end
+        end
+      end
+    end
+
+    it "should not extract numeric hashtags" do
+      no_match_hashtag_in_text("#1234")
+    end
+  end
 end
data/spec/hithighlighter_spec.rb
CHANGED
data/spec/regex_spec.rb
CHANGED
data/spec/test_urls.rb
CHANGED
@@ -1,3 +1,4 @@
+#encoding: UTF-8
 module TestUrls
   VALID = [
     "http://google.com",
@@ -20,6 +21,8 @@ module TestUrls
     "http://search.twitter.com/search?q=avro&lang=en",
     "http://mrs.domain-dash.biz",
     "http://x.com/has/one/char/domain",
+    "http://t.co/nwcLTFF",
+    # "t.co/nwcLTFF"
   ]

   INVALID = [
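The commented-out "t.co/nwcLTFF" entry marks a known gap rather than a supported case: valid_url still requires an explicit protocol or a www. prefix, so bare shortener links are not extracted. For example:

  require 'twitter-text'

  extractor = Class.new { include Twitter::Extractor }.new

  extractor.extract_urls("check this: http://t.co/nwcLTFF")  # => ["http://t.co/nwcLTFF"]
  extractor.extract_urls("check this: t.co/nwcLTFF")         # => []  (no protocol, no www.)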
data/spec/unicode_spec.rb
CHANGED
data/spec/validation_spec.rb
CHANGED
metadata
CHANGED
@@ -1,36 +1,45 @@
 --- !ruby/object:Gem::Specification
 name: twitter-text
 version: !ruby/object:Gem::Version
-
+  prerelease: false
+  segments:
+  - 1
+  - 1
+  - 4
+  version: 1.1.4
 platform: ruby
 authors:
 - Matt Sanford
 - Patrick Ewing
 - Ben Cherry
 - Britt Selvitelle
+- Raffi Krikorian
 autorequire: ""
 bindir: bin
 cert_chain: []

-date: 2010-
+date: 2010-07-08 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
   name: actionpack
-
-
-  version_requirements: !ruby/object:Gem::Requirement
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        segments:
+        - 0
         version: "0"
-
+  type: :runtime
+  version_requirements: *id001
 description: A gem that provides text handling for Twitter
 email:
 - matt@twitter.com
 - patrick.henry.ewing@gmail.com
 - bcherry@gmail.com
 - bs@brittspace.com
+- raffi@twitter.com
 executables: []

 extensions: []
@@ -70,18 +79,20 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      segments:
+      - 0
       version: "0"
-  version:
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      segments:
+      - 0
       version: "0"
-  version:
 requirements: []

 rubyforge_project:
-rubygems_version: 1.3.
+rubygems_version: 1.3.6
 signing_key:
 specification_version: 3
 summary: Twitter text handling library