twitter-text 1.1.2 → 1.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +16 -1
- data/Rakefile +5 -8
- data/lib/extractor.rb +106 -8
- data/lib/hithighlighter.rb +3 -1
- data/lib/regex.rb +17 -7
- data/lib/twitter-text.rb +1 -1
- data/spec/autolinking_spec.rb +45 -4
- data/spec/extractor_spec.rb +149 -1
- data/spec/hithighlighter_spec.rb +1 -0
- data/spec/regex_spec.rb +1 -0
- data/spec/test_urls.rb +3 -0
- data/spec/unicode_spec.rb +1 -0
- data/spec/validation_spec.rb +1 -0
- metadata +20 -9
data/README.rdoc
CHANGED
@@ -26,7 +26,7 @@ usernames, lists, hashtags and URLs.
|
|
26
26
|
# Auto-link
|
27
27
|
class MyClass
|
28
28
|
include Twitter::Autolink
|
29
|
-
|
29
|
+
|
30
30
|
html = auto_link("link @user, please #request")
|
31
31
|
end
|
32
32
|
|
@@ -69,3 +69,18 @@ To run the Conformance suite, you'll need to add that project as a git submodule
|
|
69
69
|
git submodule add git@github.com:mzsanford/twitter-text-conformance.git test/twitter-text-conformance/
|
70
70
|
git submodule init
|
71
71
|
git submodule update
|
72
|
+
|
73
|
+
=== Thanks
|
74
|
+
|
75
|
+
Thanks to everybody who has filed issues, provided feedback or contributed patches. Patches courtesy of:
|
76
|
+
|
77
|
+
* At Twitter …
|
78
|
+
* Raffi Krikorian - http://github.com/r
|
79
|
+
* Ben Cherry - http://github.com/bcherry
|
80
|
+
* Patrick Ewing - http://github.com/hoverbird
|
81
|
+
* Jeff Smick - http://github.com/sprsquish
|
82
|
+
|
83
|
+
* Patches from the community …
|
84
|
+
* Jean-Philippe Bougie - http://github.com/jpbougie
|
85
|
+
|
86
|
+
* Anyone who has filed an issue. It helps. Really.
|
data/Rakefile
CHANGED
@@ -9,9 +9,9 @@ require 'digest'
|
|
9
9
|
|
10
10
|
spec = Gem::Specification.new do |s|
|
11
11
|
s.name = "twitter-text"
|
12
|
-
s.version = "1.1.2"
|
13
|
-
s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle"]
|
14
|
-
s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com"]
|
12
|
+
s.version = "1.1.4"
|
13
|
+
s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle", "Raffi Krikorian"]
|
14
|
+
s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com", "raffi@twitter.com"]
|
15
15
|
s.homepage = "http://twitter.com"
|
16
16
|
s.description = s.summary = "A gem that provides text handling for Twitter"
|
17
17
|
|
@@ -32,6 +32,7 @@ desc "Run specs"
|
|
32
32
|
Spec::Rake::SpecTask.new do |t|
|
33
33
|
t.spec_files = FileList['spec/**/*_spec.rb']
|
34
34
|
t.spec_opts = %w(-fs --color)
|
35
|
+
t.libs << ["spec", '.']
|
35
36
|
end
|
36
37
|
|
37
38
|
desc "Run all examples with RCov"
|
@@ -41,15 +42,12 @@ Spec::Rake::SpecTask.new('spec:rcov') do |t|
|
|
41
42
|
t.rcov_opts = ['--exclude', 'spec']
|
42
43
|
end
|
43
44
|
|
44
|
-
|
45
45
|
def conformance_version(dir)
|
46
46
|
Dir[File.join(dir, '*')].inject(Digest::SHA1.new){|digest, file| digest.update(Digest::SHA1.file(file).hexdigest) }
|
47
47
|
end
|
48
48
|
|
49
49
|
namespace :test do
|
50
50
|
namespace :conformance do
|
51
|
-
|
52
|
-
|
53
51
|
desc "Update conformance testing data"
|
54
52
|
task :update do
|
55
53
|
puts "Updating conformance data ... "
|
@@ -85,8 +83,7 @@ namespace :test do
|
|
85
83
|
end
|
86
84
|
end
|
87
85
|
|
88
|
-
|
89
|
-
namespace :doc do
|
86
|
+
namespace :doc do
|
90
87
|
Rake::RDocTask.new do |rd|
|
91
88
|
rd.main = "README.rdoc"
|
92
89
|
rd.rdoc_dir = 'doc'
|
data/lib/extractor.rb
CHANGED
@@ -1,3 +1,41 @@
|
|
1
|
+
class String
|
2
|
+
# Helper function to count the character length by first converting to an
|
3
|
+
# array. This is needed because with unicode strings, the return value
|
4
|
+
# of length may be incorrect
|
5
|
+
def char_length
|
6
|
+
if respond_to? :codepoints
|
7
|
+
length
|
8
|
+
else
|
9
|
+
chars.kind_of?(Enumerable) ? chars.to_a.size : chars.size
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
# Helper function to convert this string into an array of unicode characters.
|
14
|
+
def to_char_a
|
15
|
+
@to_char_a ||= if chars.kind_of?(Enumerable)
|
16
|
+
chars.to_a
|
17
|
+
else
|
18
|
+
char_array = []
|
19
|
+
0.upto(char_length - 1) { |i| char_array << [chars.slice(i)].pack('U') }
|
20
|
+
char_array
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
# Helper function to find the index of the <tt>sub_string</tt> in
|
25
|
+
# <tt>str</tt>. This is needed because with unicode strings, the return
|
26
|
+
# of index may be incorrect.
|
27
|
+
def sub_string_search(sub_str, position = 0)
|
28
|
+
if respond_to? :codepoints
|
29
|
+
index(sub_str, position)
|
30
|
+
else
|
31
|
+
index = to_char_a[position..-1].each_with_index.find do |e|
|
32
|
+
to_char_a.slice(e.last + position, sub_str.char_length).map{|ci| ci.first }.join == sub_str
|
33
|
+
end
|
34
|
+
index.nil? ? -1 : index.last + position
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
1
39
|
module Twitter
|
2
40
|
# A module for including Tweet parsing in a class. This module provides function for the extraction and processing
|
3
41
|
# of usernames, lists, URLs and hashtags.
|
@@ -9,13 +47,38 @@ module Twitter
|
|
9
47
|
#
|
10
48
|
# If a block is given then it will be called for each username.
|
11
49
|
def extract_mentioned_screen_names(text) # :yields: username
|
50
|
+
screen_names_only = extract_mentioned_screen_names_with_indices(text).map{|mention| mention[:screen_name] }
|
51
|
+
screen_names_only.each{|mention| yield mention } if block_given?
|
52
|
+
screen_names_only
|
53
|
+
end
|
54
|
+
|
55
|
+
# Extracts a list of all usersnames mentioned in the Tweet <tt>text</tt>
|
56
|
+
# along with the indices for where the mention ocurred. If the
|
57
|
+
# <tt>text</tt> is nil or contains no username mentions, an empty array
|
58
|
+
# will be returned.
|
59
|
+
#
|
60
|
+
# If a block is given, then it will be called with each username, the start
|
61
|
+
# index, and the end index in the <tt>text</tt>.
|
62
|
+
def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
|
12
63
|
return [] unless text
|
13
64
|
|
14
65
|
possible_screen_names = []
|
15
|
-
|
16
|
-
|
66
|
+
position = 0
|
67
|
+
text.to_s.scan(Twitter::Regex[:extract_mentions]) do |before, sn, after|
|
68
|
+
unless after =~ Twitter::Regex[:at_signs]
|
69
|
+
start_position = text.to_s.sub_string_search(sn, position) - 1
|
70
|
+
position = start_position + sn.char_length + 1
|
71
|
+
possible_screen_names << {
|
72
|
+
:screen_name => sn,
|
73
|
+
:indices => [start_position, position]
|
74
|
+
}
|
75
|
+
end
|
76
|
+
end
|
77
|
+
if block_given?
|
78
|
+
possible_screen_names.each do |mention|
|
79
|
+
yield mention[:screen_name], mention[:indices].first, mention[:indices].last
|
80
|
+
end
|
17
81
|
end
|
18
|
-
possible_screen_names.each{|sn| yield sn } if block_given?
|
19
82
|
possible_screen_names
|
20
83
|
end
|
21
84
|
|
@@ -39,12 +102,30 @@ module Twitter
|
|
39
102
|
#
|
40
103
|
# If a block is given then it will be called for each URL.
|
41
104
|
def extract_urls(text) # :yields: url
|
105
|
+
urls_only = extract_urls_with_indices(text).map{|url| url[:url] }
|
106
|
+
urls_only.each{|url| yield url } if block_given?
|
107
|
+
urls_only
|
108
|
+
end
|
109
|
+
|
110
|
+
# Extracts a list of all URLs included in the Tweet <tt>text</tt> along
|
111
|
+
# with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
|
112
|
+
# URLs an empty array will be returned.
|
113
|
+
#
|
114
|
+
# If a block is given then it will be called for each URL.
|
115
|
+
def extract_urls_with_indices(text) # :yields: url, start, end
|
42
116
|
return [] unless text
|
43
117
|
urls = []
|
118
|
+
position = 0
|
44
119
|
text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, path, query|
|
45
|
-
|
120
|
+
start_position = text.to_s.sub_string_search(url, position)
|
121
|
+
end_position = start_position + url.char_length
|
122
|
+
position = end_position
|
123
|
+
urls << {
|
124
|
+
:url => (protocol == "www." ? "http://#{url}" : url),
|
125
|
+
:indices => [start_position, end_position]
|
126
|
+
}
|
46
127
|
end
|
47
|
-
urls.each{|url| yield url } if block_given?
|
128
|
+
urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last } if block_given?
|
48
129
|
urls
|
49
130
|
end
|
50
131
|
|
@@ -55,15 +136,32 @@ module Twitter
|
|
55
136
|
#
|
56
137
|
# If a block is given then it will be called for each hashtag.
|
57
138
|
def extract_hashtags(text) # :yields: hashtag_text
|
139
|
+
hashtags_only = extract_hashtags_with_indices(text).map{|hash| hash[:hashtag] }
|
140
|
+
hashtags_only.each{|hash| yield hash } if block_given?
|
141
|
+
hashtags_only
|
142
|
+
end
|
143
|
+
|
144
|
+
# Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
|
145
|
+
# <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
|
146
|
+
# will be returned. The array returned will not include the leading <tt>#</tt>
|
147
|
+
# character.
|
148
|
+
#
|
149
|
+
# If a block is given then it will be called for each hashtag.
|
150
|
+
def extract_hashtags_with_indices(text) # :yields: hashtag_text, start, end
|
58
151
|
return [] unless text
|
59
152
|
|
60
153
|
tags = []
|
154
|
+
position = 0
|
61
155
|
text.scan(Twitter::Regex[:auto_link_hashtags]) do |before, hash, hash_text|
|
62
|
-
|
156
|
+
start_position = text.to_s.sub_string_search(hash, position)
|
157
|
+
position = start_position + hash_text.char_length + 1
|
158
|
+
tags << {
|
159
|
+
:hashtag => hash_text,
|
160
|
+
:indices => [start_position, position]
|
161
|
+
}
|
63
162
|
end
|
64
|
-
tags.each{|tag| yield tag } if block_given?
|
163
|
+
tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given?
|
65
164
|
tags
|
66
165
|
end
|
67
|
-
|
68
166
|
end
|
69
167
|
end
|
data/lib/hithighlighter.rb
CHANGED
@@ -65,7 +65,7 @@ module Twitter
|
|
65
65
|
end
|
66
66
|
placed = true
|
67
67
|
end
|
68
|
-
|
68
|
+
|
69
69
|
# ultimate fallback, hits that run off the end get a closing tag
|
70
70
|
if !placed
|
71
71
|
result << tag
|
@@ -82,6 +82,8 @@ module Twitter
|
|
82
82
|
end
|
83
83
|
|
84
84
|
result
|
85
|
+
rescue
|
86
|
+
text
|
85
87
|
end
|
86
88
|
end
|
87
89
|
end
|
data/lib/regex.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# coding: UTF-8
|
1
2
|
|
2
3
|
module Twitter
|
3
4
|
# A collection of regular expressions for parsing Tweet text. The regular expression
|
@@ -30,7 +31,7 @@ module Twitter
|
|
30
31
|
REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(?=(.|$))/o
|
31
32
|
REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
|
32
33
|
|
33
|
-
REGEXEN[:list_name] = /^[a-zA-Z\
|
34
|
+
REGEXEN[:list_name] = /^[a-zA-Z\u0080-\u00ff].{0,79}$/
|
34
35
|
|
35
36
|
# Latin accented characters (subtracted 0xD7 from the range, it's a confusable multiplication sign. Looks like "x")
|
36
37
|
LATIN_ACCENTS = [(0xc0..0xd6).to_a, (0xd8..0xf6).to_a, (0xf8..0xff).to_a].flatten.pack('U*').freeze
|
@@ -39,17 +40,24 @@ module Twitter
|
|
39
40
|
# Characters considered valid in a hashtag but not at the beginning, where only a-z and 0-9 are valid.
|
40
41
|
HASHTAG_CHARACTERS = /[a-z0-9_#{LATIN_ACCENTS}]/io
|
41
42
|
REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/]+)(#|#)([0-9A-Z_]*[A-Z_]+#{HASHTAG_CHARACTERS}*)/io
|
42
|
-
REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9\
|
43
|
+
REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9\u0080-\u00ff\-]{0,79})?/
|
43
44
|
REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\<\|:~\(|\}:o\{|:\-\[|\>o\<|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
|
44
45
|
|
45
46
|
# URL related hash regex collection
|
46
47
|
REGEXEN[:valid_preceding_chars] = /(?:[^\/"':!=]|^|\:)/
|
47
48
|
REGEXEN[:valid_domain] = /(?:[^[:punct:]\s][\.-](?=[^[:punct:]\s])|[^[:punct:]\s]){1,}\.[a-z]{2,}(?::[0-9]+)?/i
|
48
|
-
|
49
|
+
|
50
|
+
# Allow URL paths to contain balanced parens
|
51
|
+
# 1. Used in Wikipedia URLs like /Primer_(film)
|
52
|
+
# 2. Used in IIS sessions like /S(dfd346)/
|
53
|
+
REGEXEN[:wikipedia_disambiguation] = /(?:\([^\)]+\))/i
|
54
|
+
REGEXEN[:valid_url_path_chars] = /(?:
|
55
|
+
#{REGEXEN[:wikipedia_disambiguation]}|
|
56
|
+
[\.\,]?[a-z0-9!\*';:=\+\$\/%#\[\]\-_,~@]
|
57
|
+
)/ix
|
49
58
|
# Valid end-of-path chracters (so /foo. does not gobble the period).
|
50
|
-
# 1. Allow
|
51
|
-
|
52
|
-
REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9\)=#\/]/i
|
59
|
+
# 1. Allow =&# for empty URL parameters and other URL-join artifacts
|
60
|
+
REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=#\/]/i
|
53
61
|
REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~]/i
|
54
62
|
REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#]/i
|
55
63
|
REGEXEN[:valid_url] = %r{
|
@@ -58,7 +66,9 @@ module Twitter
|
|
58
66
|
( # $3 URL
|
59
67
|
(https?:\/\/|www\.) # $4 Protocol or beginning
|
60
68
|
(#{REGEXEN[:valid_domain]}) # $5 Domain(s) and optional post number
|
61
|
-
(/#{REGEXEN[:valid_url_path_chars]}
|
69
|
+
(/#{REGEXEN[:valid_url_path_chars]}*
|
70
|
+
#{REGEXEN[:valid_url_path_ending_chars]}?
|
71
|
+
)? # $6 URL Path
|
62
72
|
(\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $7 Query String
|
63
73
|
)
|
64
74
|
)
|
data/lib/twitter-text.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless ['u','UTF8'].include?($KCODE)
|
1
|
+
raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless ['u','UTF8'].include?($KCODE) || ''.respond_to?(:codepoints)
|
2
2
|
|
3
3
|
require 'rubygems'
|
4
4
|
|
data/spec/autolinking_spec.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
-
|
1
|
+
#encoding: UTF-8
|
2
|
+
# require File.dirname(__FILE__) + '/spec_helper'
|
3
|
+
require 'spec_helper'
|
2
4
|
|
3
5
|
class TestAutolink
|
4
6
|
include Twitter::Autolink
|
@@ -303,7 +305,7 @@ describe Twitter::Autolink do
|
|
303
305
|
|
304
306
|
it "should be linked" do
|
305
307
|
link = Hpricot(@autolinked_text).at('a')
|
306
|
-
link.inner_text.should == "#{[0xFF03].pack('U')}twj_dev"
|
308
|
+
(link.inner_text.respond_to?(:force_encoding) ? link.inner_text.force_encoding("utf-8") : link.inner_text).should == "#{[0xFF03].pack('U')}twj_dev"
|
307
309
|
link['href'].should == 'http://twitter.com/search?q=%23twj_dev'
|
308
310
|
end
|
309
311
|
end
|
@@ -329,7 +331,7 @@ describe Twitter::Autolink do
|
|
329
331
|
end
|
330
332
|
end
|
331
333
|
|
332
|
-
context "
|
334
|
+
context "with a path surrounded by parentheses;" do
|
333
335
|
def original_text; "I found a neatness (#{url})"; end
|
334
336
|
|
335
337
|
it "should be linked" do
|
@@ -340,10 +342,49 @@ describe Twitter::Autolink do
|
|
340
342
|
def url; "http://www.google.com/"; end
|
341
343
|
|
342
344
|
it "should be linked" do
|
343
|
-
pending # our support for Wikipedia URLS containing parens breaks this corner case
|
344
345
|
@autolinked_text.should have_autolinked_url(url)
|
345
346
|
end
|
346
347
|
end
|
348
|
+
|
349
|
+
context "when the URL has a path;" do
|
350
|
+
def url; "http://www.google.com/fsdfasdf"; end
|
351
|
+
|
352
|
+
it "should be linked" do
|
353
|
+
@autolinked_text.should have_autolinked_url(url)
|
354
|
+
end
|
355
|
+
end
|
356
|
+
end
|
357
|
+
|
358
|
+
context "when path contains parens" do
|
359
|
+
def original_text; "I found a neatness (#{url})"; end
|
360
|
+
|
361
|
+
it "should be linked" do
|
362
|
+
@autolinked_text.should have_autolinked_url(url)
|
363
|
+
end
|
364
|
+
|
365
|
+
context "wikipedia" do
|
366
|
+
def url; "http://en.wikipedia.org/wiki/Madonna_(artist)"; end
|
367
|
+
|
368
|
+
it "should be linked" do
|
369
|
+
@autolinked_text.should have_autolinked_url(url)
|
370
|
+
end
|
371
|
+
end
|
372
|
+
|
373
|
+
context "IIS session" do
|
374
|
+
def url; "http://msdn.com/S(deadbeef)/page.htm"; end
|
375
|
+
|
376
|
+
it "should be linked" do
|
377
|
+
@autolinked_text.should have_autolinked_url(url)
|
378
|
+
end
|
379
|
+
end
|
380
|
+
|
381
|
+
context "unbalanced parens" do
|
382
|
+
def url; "http://example.com/i_has_a_("; end
|
383
|
+
|
384
|
+
it "should be linked" do
|
385
|
+
@autolinked_text.should have_autolinked_url("http://example.com/i_has_a_")
|
386
|
+
end
|
387
|
+
end
|
347
388
|
end
|
348
389
|
|
349
390
|
context "when preceded by a :" do
|
data/spec/extractor_spec.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
#encoding: UTF-8
|
1
2
|
require File.dirname(__FILE__) + '/spec_helper'
|
2
3
|
|
3
4
|
class TestExtractor
|
@@ -49,6 +50,58 @@ describe Twitter::Extractor do
|
|
49
50
|
end
|
50
51
|
end
|
51
52
|
|
53
|
+
describe "mentions with indices" do
|
54
|
+
context "single screen name alone " do
|
55
|
+
it "should be linked and the correct indices" do
|
56
|
+
@extractor.extract_mentioned_screen_names_with_indices("@alice").should == [{:screen_name => "alice", :indices => [0, 6]}]
|
57
|
+
end
|
58
|
+
|
59
|
+
it "should be linked with _ and the correct indices" do
|
60
|
+
@extractor.extract_mentioned_screen_names_with_indices("@alice_adams").should == [{:screen_name => "alice_adams", :indices => [0, 12]}]
|
61
|
+
end
|
62
|
+
|
63
|
+
it "should be linked if numeric and the correct indices" do
|
64
|
+
@extractor.extract_mentioned_screen_names_with_indices("@1234").should == [{:screen_name => "1234", :indices => [0, 5]}]
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
context "multiple screen names" do
|
69
|
+
it "should both be linked with the correct indices" do
|
70
|
+
@extractor.extract_mentioned_screen_names_with_indices("@alice @bob").should ==
|
71
|
+
[{:screen_name => "alice", :indices => [0, 6]},
|
72
|
+
{:screen_name => "bob", :indices => [7, 11]}]
|
73
|
+
end
|
74
|
+
|
75
|
+
it "should be linked with the correct indices even when repeated" do
|
76
|
+
@extractor.extract_mentioned_screen_names_with_indices("@alice @alice @bob").should ==
|
77
|
+
[{:screen_name => "alice", :indices => [0, 6]},
|
78
|
+
{:screen_name => "alice", :indices => [7, 13]},
|
79
|
+
{:screen_name => "bob", :indices => [14, 18]}]
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
context "screen names embedded in text" do
|
84
|
+
it "should be linked in Latin text with the correct indices" do
|
85
|
+
@extractor.extract_mentioned_screen_names_with_indices("waiting for @alice to arrive").should == [{:screen_name => "alice", :indices => [12, 18]}]
|
86
|
+
end
|
87
|
+
|
88
|
+
it "should be linked in Japanese text with the correct indices" do
|
89
|
+
@extractor.extract_mentioned_screen_names_with_indices("の@aliceに到着を待っている").should == [{:screen_name => "alice", :indices => [1, 7]}]
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
it "should accept a block arugment and call it in order" do
|
94
|
+
needed = [{:screen_name => "alice", :indices => [0, 6]}, {:screen_name => "bob", :indices => [7, 11]}]
|
95
|
+
@extractor.extract_mentioned_screen_names_with_indices("@alice @bob") do |sn, start_index, end_index|
|
96
|
+
data = needed.shift
|
97
|
+
sn.should == data[:screen_name]
|
98
|
+
start_index.should == data[:indices].first
|
99
|
+
end_index.should == data[:indices].last
|
100
|
+
end
|
101
|
+
needed.should == []
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
52
105
|
describe "replies" do
|
53
106
|
context "should be extracted from" do
|
54
107
|
it "should extract from lone name" do
|
@@ -118,6 +171,37 @@ describe Twitter::Extractor do
|
|
118
171
|
end
|
119
172
|
end
|
120
173
|
|
174
|
+
describe "urls with indices" do
|
175
|
+
describe "matching URLS" do
|
176
|
+
TestUrls::VALID.each do |url|
|
177
|
+
it "should extract the URL #{url} and prefix it with a protocol if missing" do
|
178
|
+
extracted_urls = @extractor.extract_urls_with_indices(url)
|
179
|
+
extracted_urls.size.should == 1
|
180
|
+
extracted_url = extracted_urls.first
|
181
|
+
extracted_url[:url].should include(url)
|
182
|
+
extracted_url[:indices].first.should == 0
|
183
|
+
extracted_url[:indices].last.should == url.chars.to_a.size
|
184
|
+
end
|
185
|
+
|
186
|
+
it "should match the URL #{url} when it's embedded in other text" do
|
187
|
+
text = "Sweet url: #{url} I found. #awesome"
|
188
|
+
extracted_urls = @extractor.extract_urls_with_indices(text)
|
189
|
+
extracted_urls.size.should == 1
|
190
|
+
extracted_url = extracted_urls.first
|
191
|
+
extracted_url[:url].should include(url)
|
192
|
+
extracted_url[:indices].first.should == 11
|
193
|
+
extracted_url[:indices].last.should == 11 + url.chars.to_a.size
|
194
|
+
end
|
195
|
+
end
|
196
|
+
end
|
197
|
+
|
198
|
+
describe "invalid URLS" do
|
199
|
+
it "does not link urls with invalid domains" do
|
200
|
+
@extractor.extract_urls_with_indices("http://tld-too-short.x").should == []
|
201
|
+
end
|
202
|
+
end
|
203
|
+
end
|
204
|
+
|
121
205
|
describe "hashtags" do
|
122
206
|
context "extracts latin/numeric hashtags" do
|
123
207
|
%w(text text123 123text).each do |hashtag|
|
@@ -163,7 +247,6 @@ describe Twitter::Extractor do
|
|
163
247
|
end
|
164
248
|
end
|
165
249
|
end
|
166
|
-
|
167
250
|
end
|
168
251
|
|
169
252
|
it "should not extract numeric hashtags" do
|
@@ -171,4 +254,69 @@ describe Twitter::Extractor do
|
|
171
254
|
end
|
172
255
|
end
|
173
256
|
|
257
|
+
describe "hashtags with indices" do
|
258
|
+
def match_hashtag_in_text(hashtag, text, offset = 0)
|
259
|
+
extracted_hashtags = @extractor.extract_hashtags_with_indices(text)
|
260
|
+
extracted_hashtags.size.should == 1
|
261
|
+
extracted_hashtag = extracted_hashtags.first
|
262
|
+
extracted_hashtag[:hashtag].should == hashtag
|
263
|
+
extracted_hashtag[:indices].first.should == offset
|
264
|
+
extracted_hashtag[:indices].last.should == offset + hashtag.chars.to_a.size + 1
|
265
|
+
end
|
266
|
+
|
267
|
+
def no_match_hashtag_in_text(text)
|
268
|
+
extracted_hashtags = @extractor.extract_hashtags_with_indices(text)
|
269
|
+
extracted_hashtags.size.should == 0
|
270
|
+
end
|
271
|
+
|
272
|
+
context "extracts latin/numeric hashtags" do
|
273
|
+
%w(text text123 123text).each do |hashtag|
|
274
|
+
it "should extract ##{hashtag}" do
|
275
|
+
match_hashtag_in_text(hashtag, "##{hashtag}")
|
276
|
+
end
|
277
|
+
|
278
|
+
it "should extract ##{hashtag} within text" do
|
279
|
+
match_hashtag_in_text(hashtag, "pre-text ##{hashtag} post-text", 9)
|
280
|
+
end
|
281
|
+
end
|
282
|
+
end
|
283
|
+
|
284
|
+
context "international hashtags" do
|
285
|
+
context "should allow accents" do
|
286
|
+
%w(mañana café münchen).each do |hashtag|
|
287
|
+
it "should extract ##{hashtag}" do
|
288
|
+
match_hashtag_in_text(hashtag, "##{hashtag}")
|
289
|
+
end
|
290
|
+
|
291
|
+
it "should extract ##{hashtag} within text" do
|
292
|
+
match_hashtag_in_text(hashtag, "pre-text ##{hashtag} post-text", 9)
|
293
|
+
end
|
294
|
+
end
|
295
|
+
|
296
|
+
it "should not allow the multiplication character" do
|
297
|
+
match_hashtag_in_text('pre', "#pre#{[0xd7].pack('U')}post")
|
298
|
+
end
|
299
|
+
|
300
|
+
it "should not allow the division character" do
|
301
|
+
match_hashtag_in_text('pre', "#pre#{[0xf7].pack('U')}post")
|
302
|
+
end
|
303
|
+
end
|
304
|
+
|
305
|
+
context "should NOT allow Japanese" do
|
306
|
+
%w(会議中 ハッシュ).each do |hashtag|
|
307
|
+
it "should NOT extract ##{hashtag}" do
|
308
|
+
no_match_hashtag_in_text("##{hashtag}")
|
309
|
+
end
|
310
|
+
|
311
|
+
it "should NOT extract ##{hashtag} within text" do
|
312
|
+
no_match_hashtag_in_text("pre-text ##{hashtag} post-text")
|
313
|
+
end
|
314
|
+
end
|
315
|
+
end
|
316
|
+
end
|
317
|
+
|
318
|
+
it "should not extract numeric hashtags" do
|
319
|
+
no_match_hashtag_in_text("#1234")
|
320
|
+
end
|
321
|
+
end
|
174
322
|
end
|
data/spec/hithighlighter_spec.rb
CHANGED
data/spec/regex_spec.rb
CHANGED
data/spec/test_urls.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
#encoding: UTF-8
|
1
2
|
module TestUrls
|
2
3
|
VALID = [
|
3
4
|
"http://google.com",
|
@@ -20,6 +21,8 @@ module TestUrls
|
|
20
21
|
"http://search.twitter.com/search?q=avro&lang=en",
|
21
22
|
"http://mrs.domain-dash.biz",
|
22
23
|
"http://x.com/has/one/char/domain",
|
24
|
+
"http://t.co/nwcLTFF",
|
25
|
+
# "t.co/nwcLTFF"
|
23
26
|
]
|
24
27
|
|
25
28
|
INVALID = [
|
data/spec/unicode_spec.rb
CHANGED
data/spec/validation_spec.rb
CHANGED
metadata
CHANGED
@@ -1,36 +1,45 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twitter-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 1
|
7
|
+
- 1
|
8
|
+
- 4
|
9
|
+
version: 1.1.4
|
5
10
|
platform: ruby
|
6
11
|
authors:
|
7
12
|
- Matt Sanford
|
8
13
|
- Patrick Ewing
|
9
14
|
- Ben Cherry
|
10
15
|
- Britt Selvitelle
|
16
|
+
- Raffi Krikorian
|
11
17
|
autorequire: ""
|
12
18
|
bindir: bin
|
13
19
|
cert_chain: []
|
14
20
|
|
15
|
-
date: 2010-
|
21
|
+
date: 2010-07-08 00:00:00 -07:00
|
16
22
|
default_executable:
|
17
23
|
dependencies:
|
18
24
|
- !ruby/object:Gem::Dependency
|
19
25
|
name: actionpack
|
20
|
-
|
21
|
-
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
prerelease: false
|
27
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
23
28
|
requirements:
|
24
29
|
- - ">="
|
25
30
|
- !ruby/object:Gem::Version
|
31
|
+
segments:
|
32
|
+
- 0
|
26
33
|
version: "0"
|
27
|
-
|
34
|
+
type: :runtime
|
35
|
+
version_requirements: *id001
|
28
36
|
description: A gem that provides text handling for Twitter
|
29
37
|
email:
|
30
38
|
- matt@twitter.com
|
31
39
|
- patrick.henry.ewing@gmail.com
|
32
40
|
- bcherry@gmail.com
|
33
41
|
- bs@brittspace.com
|
42
|
+
- raffi@twitter.com
|
34
43
|
executables: []
|
35
44
|
|
36
45
|
extensions: []
|
@@ -70,18 +79,20 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
70
79
|
requirements:
|
71
80
|
- - ">="
|
72
81
|
- !ruby/object:Gem::Version
|
82
|
+
segments:
|
83
|
+
- 0
|
73
84
|
version: "0"
|
74
|
-
version:
|
75
85
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
86
|
requirements:
|
77
87
|
- - ">="
|
78
88
|
- !ruby/object:Gem::Version
|
89
|
+
segments:
|
90
|
+
- 0
|
79
91
|
version: "0"
|
80
|
-
version:
|
81
92
|
requirements: []
|
82
93
|
|
83
94
|
rubyforge_project:
|
84
|
-
rubygems_version: 1.3.
|
95
|
+
rubygems_version: 1.3.6
|
85
96
|
signing_key:
|
86
97
|
specification_version: 3
|
87
98
|
summary: Twitter text handling library
|