twitter-text 1.1.2 → 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -26,7 +26,7 @@ usernames, lists, hashtags and URLs.
26
26
  # Auto-link
27
27
  class MyClass
28
28
  include Twitter::Autolink
29
-
29
+
30
30
  html = auto_link("link @user, please #request")
31
31
  end
32
32
 
@@ -69,3 +69,18 @@ To run the Conformance suite, you'll need to add that project as a git submodule
69
69
  git submodule add git@github.com:mzsanford/twitter-text-conformance.git test/twitter-text-conformance/
70
70
  git submodule init
71
71
  git submodule update
72
+
73
+ === Thanks
74
+
75
+ Thanks to everybody who has filed issues, provided feedback or contributed patches. Patches courtesy of:
76
+
77
+ * At Twitter …
78
+ * Raffi Krikorian - http://github.com/r
79
+ * Ben Cherry - http://github.com/bcherry
80
+ * Patrick Ewing - http://github.com/hoverbird
81
+ * Jeff Smick - http://github.com/sprsquish
82
+
83
+ * Patches from the community …
84
+ * Jean-Philippe Bougie - http://github.com/jpbougie
85
+
86
+ * Anyone who has filed an issue. It helps. Really.
data/Rakefile CHANGED
@@ -9,9 +9,9 @@ require 'digest'
9
9
 
10
10
  spec = Gem::Specification.new do |s|
11
11
  s.name = "twitter-text"
12
- s.version = "1.1.2"
13
- s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle"]
14
- s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com"]
12
+ s.version = "1.1.4"
13
+ s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle", "Raffi Krikorian"]
14
+ s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com", "raffi@twitter.com"]
15
15
  s.homepage = "http://twitter.com"
16
16
  s.description = s.summary = "A gem that provides text handling for Twitter"
17
17
 
@@ -32,6 +32,7 @@ desc "Run specs"
32
32
  Spec::Rake::SpecTask.new do |t|
33
33
  t.spec_files = FileList['spec/**/*_spec.rb']
34
34
  t.spec_opts = %w(-fs --color)
35
+ t.libs << ["spec", '.']
35
36
  end
36
37
 
37
38
  desc "Run all examples with RCov"
@@ -41,15 +42,12 @@ Spec::Rake::SpecTask.new('spec:rcov') do |t|
41
42
  t.rcov_opts = ['--exclude', 'spec']
42
43
  end
43
44
 
44
-
45
45
  def conformance_version(dir)
46
46
  Dir[File.join(dir, '*')].inject(Digest::SHA1.new){|digest, file| digest.update(Digest::SHA1.file(file).hexdigest) }
47
47
  end
48
48
 
49
49
  namespace :test do
50
50
  namespace :conformance do
51
-
52
-
53
51
  desc "Update conformance testing data"
54
52
  task :update do
55
53
  puts "Updating conformance data ... "
@@ -85,8 +83,7 @@ namespace :test do
85
83
  end
86
84
  end
87
85
 
88
-
89
- namespace :doc do
86
+ namespace :doc do
90
87
  Rake::RDocTask.new do |rd|
91
88
  rd.main = "README.rdoc"
92
89
  rd.rdoc_dir = 'doc'
data/lib/extractor.rb CHANGED
@@ -1,3 +1,41 @@
1
+ class String
2
+ # Helper function to count the character length by first converting to an
3
+ # array. This is needed because with unicode strings, the return value
4
+ # of length may be incorrect
5
+ def char_length
6
+ if respond_to? :codepoints
7
+ length
8
+ else
9
+ chars.kind_of?(Enumerable) ? chars.to_a.size : chars.size
10
+ end
11
+ end
12
+
13
+ # Helper function to convert this string into an array of unicode characters.
14
+ def to_char_a
15
+ @to_char_a ||= if chars.kind_of?(Enumerable)
16
+ chars.to_a
17
+ else
18
+ char_array = []
19
+ 0.upto(char_length - 1) { |i| char_array << [chars.slice(i)].pack('U') }
20
+ char_array
21
+ end
22
+ end
23
+
24
+ # Helper function to find the index of the <tt>sub_string</tt> in
25
+ # <tt>str</tt>. This is needed because with unicode strings, the return
26
+ # of index may be incorrect.
27
+ def sub_string_search(sub_str, position = 0)
28
+ if respond_to? :codepoints
29
+ index(sub_str, position)
30
+ else
31
+ index = to_char_a[position..-1].each_with_index.find do |e|
32
+ to_char_a.slice(e.last + position, sub_str.char_length).map{|ci| ci.first }.join == sub_str
33
+ end
34
+ index.nil? ? -1 : index.last + position
35
+ end
36
+ end
37
+ end
38
+
1
39
  module Twitter
2
40
  # A module for including Tweet parsing in a class. This module provides functions for the extraction and processing
3
41
  # of usernames, lists, URLs and hashtags.
@@ -9,13 +47,38 @@ module Twitter
9
47
  #
10
48
  # If a block is given then it will be called for each username.
11
49
  def extract_mentioned_screen_names(text) # :yields: username
50
+ screen_names_only = extract_mentioned_screen_names_with_indices(text).map{|mention| mention[:screen_name] }
51
+ screen_names_only.each{|mention| yield mention } if block_given?
52
+ screen_names_only
53
+ end
54
+
55
+ # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>
56
+ # along with the indices for where the mention occurred. If the
57
+ # <tt>text</tt> is nil or contains no username mentions, an empty array
58
+ # will be returned.
59
+ #
60
+ # If a block is given, then it will be called with each username, the start
61
+ # index, and the end index in the <tt>text</tt>.
62
+ def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
12
63
  return [] unless text
13
64
 
14
65
  possible_screen_names = []
15
- text.scan(Twitter::Regex[:extract_mentions]) do |before, sn, after|
16
- possible_screen_names << sn unless after =~ Twitter::Regex[:at_signs]
66
+ position = 0
67
+ text.to_s.scan(Twitter::Regex[:extract_mentions]) do |before, sn, after|
68
+ unless after =~ Twitter::Regex[:at_signs]
69
+ start_position = text.to_s.sub_string_search(sn, position) - 1
70
+ position = start_position + sn.char_length + 1
71
+ possible_screen_names << {
72
+ :screen_name => sn,
73
+ :indices => [start_position, position]
74
+ }
75
+ end
76
+ end
77
+ if block_given?
78
+ possible_screen_names.each do |mention|
79
+ yield mention[:screen_name], mention[:indices].first, mention[:indices].last
80
+ end
17
81
  end
18
- possible_screen_names.each{|sn| yield sn } if block_given?
19
82
  possible_screen_names
20
83
  end
21
84
 
@@ -39,12 +102,30 @@ module Twitter
39
102
  #
40
103
  # If a block is given then it will be called for each URL.
41
104
  def extract_urls(text) # :yields: url
105
+ urls_only = extract_urls_with_indices(text).map{|url| url[:url] }
106
+ urls_only.each{|url| yield url } if block_given?
107
+ urls_only
108
+ end
109
+
110
+ # Extracts a list of all URLs included in the Tweet <tt>text</tt> along
111
+ # with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
112
+ # URLs an empty array will be returned.
113
+ #
114
+ # If a block is given then it will be called for each URL.
115
+ def extract_urls_with_indices(text) # :yields: url, start, end
42
116
  return [] unless text
43
117
  urls = []
118
+ position = 0
44
119
  text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, path, query|
45
- urls << (protocol == "www." ? "http://#{url}" : url)
120
+ start_position = text.to_s.sub_string_search(url, position)
121
+ end_position = start_position + url.char_length
122
+ position = end_position
123
+ urls << {
124
+ :url => (protocol == "www." ? "http://#{url}" : url),
125
+ :indices => [start_position, end_position]
126
+ }
46
127
  end
47
- urls.each{|url| yield url } if block_given?
128
+ urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last } if block_given?
48
129
  urls
49
130
  end
50
131
 
@@ -55,15 +136,32 @@ module Twitter
55
136
  #
56
137
  # If a block is given then it will be called for each hashtag.
57
138
  def extract_hashtags(text) # :yields: hashtag_text
139
+ hashtags_only = extract_hashtags_with_indices(text).map{|hash| hash[:hashtag] }
140
+ hashtags_only.each{|hash| yield hash } if block_given?
141
+ hashtags_only
142
+ end
143
+
144
+ # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
145
+ # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
146
+ # will be returned. The array returned will not include the leading <tt>#</tt>
147
+ # character.
148
+ #
149
+ # If a block is given then it will be called for each hashtag.
150
+ def extract_hashtags_with_indices(text) # :yields: hashtag_text, start, end
58
151
  return [] unless text
59
152
 
60
153
  tags = []
154
+ position = 0
61
155
  text.scan(Twitter::Regex[:auto_link_hashtags]) do |before, hash, hash_text|
62
- tags << hash_text
156
+ start_position = text.to_s.sub_string_search(hash, position)
157
+ position = start_position + hash_text.char_length + 1
158
+ tags << {
159
+ :hashtag => hash_text,
160
+ :indices => [start_position, position]
161
+ }
63
162
  end
64
- tags.each{|tag| yield tag } if block_given?
163
+ tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given?
65
164
  tags
66
165
  end
67
-
68
166
  end
69
167
  end
@@ -65,7 +65,7 @@ module Twitter
65
65
  end
66
66
  placed = true
67
67
  end
68
-
68
+
69
69
  # ultimate fallback, hits that run off the end get a closing tag
70
70
  if !placed
71
71
  result << tag
@@ -82,6 +82,8 @@ module Twitter
82
82
  end
83
83
 
84
84
  result
85
+ rescue
86
+ text
85
87
  end
86
88
  end
87
89
  end
data/lib/regex.rb CHANGED
@@ -1,3 +1,4 @@
1
+ # coding: UTF-8
1
2
 
2
3
  module Twitter
3
4
  # A collection of regular expressions for parsing Tweet text. The regular expression
@@ -30,7 +31,7 @@ module Twitter
30
31
  REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(?=(.|$))/o
31
32
  REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
32
33
 
33
- REGEXEN[:list_name] = /^[a-zA-Z\x80-\xff].{0,79}$/
34
+ REGEXEN[:list_name] = /^[a-zA-Z\u0080-\u00ff].{0,79}$/
34
35
 
35
36
  # Latin accented characters (subtracted 0xD7 from the range, it's a confusable multiplication sign. Looks like "x")
36
37
  LATIN_ACCENTS = [(0xc0..0xd6).to_a, (0xd8..0xf6).to_a, (0xf8..0xff).to_a].flatten.pack('U*').freeze
@@ -39,17 +40,24 @@ module Twitter
39
40
  # Characters considered valid in a hashtag but not at the beginning, where only a-z and 0-9 are valid.
40
41
  HASHTAG_CHARACTERS = /[a-z0-9_#{LATIN_ACCENTS}]/io
41
42
  REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/]+)(#|#)([0-9A-Z_]*[A-Z_]+#{HASHTAG_CHARACTERS}*)/io
42
- REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9\x80-\xff\-]{0,79})?/
43
+ REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9\u0080-\u00ff\-]{0,79})?/
43
44
  REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
44
45
 
45
46
  # URL related hash regex collection
46
47
  REGEXEN[:valid_preceding_chars] = /(?:[^\/"':!=]|^|\:)/
47
48
  REGEXEN[:valid_domain] = /(?:[^[:punct:]\s][\.-](?=[^[:punct:]\s])|[^[:punct:]\s]){1,}\.[a-z]{2,}(?::[0-9]+)?/i
48
- REGEXEN[:valid_url_path_chars] = /[\.\,]?[a-z0-9!\*'\(\);:=\+\$\/%#\[\]\-_,~@]/i
49
+
50
+ # Allow URL paths to contain balanced parens
51
+ # 1. Used in Wikipedia URLs like /Primer_(film)
52
+ # 2. Used in IIS sessions like /S(dfd346)/
53
+ REGEXEN[:wikipedia_disambiguation] = /(?:\([^\)]+\))/i
54
+ REGEXEN[:valid_url_path_chars] = /(?:
55
+ #{REGEXEN[:wikipedia_disambiguation]}|
56
+ [\.\,]?[a-z0-9!\*';:=\+\$\/%#\[\]\-_,~@]
57
+ )/ix
49
58
  # Valid end-of-path characters (so /foo. does not gobble the period).
50
- # 1. Allow ) for Wikipedia URLs.
51
- # 2. Allow =&# for empty URL parameters and other URL-join artifacts
52
- REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9\)=#\/]/i
59
+ # 1. Allow =&# for empty URL parameters and other URL-join artifacts
60
+ REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=#\/]/i
53
61
  REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~]/i
54
62
  REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#]/i
55
63
  REGEXEN[:valid_url] = %r{
@@ -58,7 +66,9 @@ module Twitter
58
66
  ( # $3 URL
59
67
  (https?:\/\/|www\.) # $4 Protocol or beginning
60
68
  (#{REGEXEN[:valid_domain]}) # $5 Domain(s) and optional post number
61
- (/#{REGEXEN[:valid_url_path_chars]}*#{REGEXEN[:valid_url_path_ending_chars]}?)? # $6 URL Path
69
+ (/#{REGEXEN[:valid_url_path_chars]}*
70
+ #{REGEXEN[:valid_url_path_ending_chars]}?
71
+ )? # $6 URL Path
62
72
  (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $7 Query String
63
73
  )
64
74
  )
data/lib/twitter-text.rb CHANGED
@@ -1,4 +1,4 @@
1
- raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless ['u','UTF8'].include?($KCODE)
1
+ raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless ['u','UTF8'].include?($KCODE) || ''.respond_to?(:codepoints)
2
2
 
3
3
  require 'rubygems'
4
4
 
@@ -1,4 +1,6 @@
1
- require File.dirname(__FILE__) + '/spec_helper'
1
+ #encoding: UTF-8
2
+ # require File.dirname(__FILE__) + '/spec_helper'
3
+ require 'spec_helper'
2
4
 
3
5
  class TestAutolink
4
6
  include Twitter::Autolink
@@ -303,7 +305,7 @@ describe Twitter::Autolink do
303
305
 
304
306
  it "should be linked" do
305
307
  link = Hpricot(@autolinked_text).at('a')
306
- link.inner_text.should == "#{[0xFF03].pack('U')}twj_dev"
308
+ (link.inner_text.respond_to?(:force_encoding) ? link.inner_text.force_encoding("utf-8") : link.inner_text).should == "#{[0xFF03].pack('U')}twj_dev"
307
309
  link['href'].should == 'http://twitter.com/search?q=%23twj_dev'
308
310
  end
309
311
  end
@@ -329,7 +331,7 @@ describe Twitter::Autolink do
329
331
  end
330
332
  end
331
333
 
332
- context "when surrounded by parentheses;" do
334
+ context "with a path surrounded by parentheses;" do
333
335
  def original_text; "I found a neatness (#{url})"; end
334
336
 
335
337
  it "should be linked" do
@@ -340,10 +342,49 @@ describe Twitter::Autolink do
340
342
  def url; "http://www.google.com/"; end
341
343
 
342
344
  it "should be linked" do
343
- pending # our support for Wikipedia URLS containing parens breaks this corner case
344
345
  @autolinked_text.should have_autolinked_url(url)
345
346
  end
346
347
  end
348
+
349
+ context "when the URL has a path;" do
350
+ def url; "http://www.google.com/fsdfasdf"; end
351
+
352
+ it "should be linked" do
353
+ @autolinked_text.should have_autolinked_url(url)
354
+ end
355
+ end
356
+ end
357
+
358
+ context "when path contains parens" do
359
+ def original_text; "I found a neatness (#{url})"; end
360
+
361
+ it "should be linked" do
362
+ @autolinked_text.should have_autolinked_url(url)
363
+ end
364
+
365
+ context "wikipedia" do
366
+ def url; "http://en.wikipedia.org/wiki/Madonna_(artist)"; end
367
+
368
+ it "should be linked" do
369
+ @autolinked_text.should have_autolinked_url(url)
370
+ end
371
+ end
372
+
373
+ context "IIS session" do
374
+ def url; "http://msdn.com/S(deadbeef)/page.htm"; end
375
+
376
+ it "should be linked" do
377
+ @autolinked_text.should have_autolinked_url(url)
378
+ end
379
+ end
380
+
381
+ context "unbalanced parens" do
382
+ def url; "http://example.com/i_has_a_("; end
383
+
384
+ it "should be linked" do
385
+ @autolinked_text.should have_autolinked_url("http://example.com/i_has_a_")
386
+ end
387
+ end
347
388
  end
348
389
 
349
390
  context "when preceded by a :" do
@@ -1,3 +1,4 @@
1
+ #encoding: UTF-8
1
2
  require File.dirname(__FILE__) + '/spec_helper'
2
3
 
3
4
  class TestExtractor
@@ -49,6 +50,58 @@ describe Twitter::Extractor do
49
50
  end
50
51
  end
51
52
 
53
+ describe "mentions with indices" do
54
+ context "single screen name alone " do
55
+ it "should be linked and the correct indices" do
56
+ @extractor.extract_mentioned_screen_names_with_indices("@alice").should == [{:screen_name => "alice", :indices => [0, 6]}]
57
+ end
58
+
59
+ it "should be linked with _ and the correct indices" do
60
+ @extractor.extract_mentioned_screen_names_with_indices("@alice_adams").should == [{:screen_name => "alice_adams", :indices => [0, 12]}]
61
+ end
62
+
63
+ it "should be linked if numeric and the correct indices" do
64
+ @extractor.extract_mentioned_screen_names_with_indices("@1234").should == [{:screen_name => "1234", :indices => [0, 5]}]
65
+ end
66
+ end
67
+
68
+ context "multiple screen names" do
69
+ it "should both be linked with the correct indices" do
70
+ @extractor.extract_mentioned_screen_names_with_indices("@alice @bob").should ==
71
+ [{:screen_name => "alice", :indices => [0, 6]},
72
+ {:screen_name => "bob", :indices => [7, 11]}]
73
+ end
74
+
75
+ it "should be linked with the correct indices even when repeated" do
76
+ @extractor.extract_mentioned_screen_names_with_indices("@alice @alice @bob").should ==
77
+ [{:screen_name => "alice", :indices => [0, 6]},
78
+ {:screen_name => "alice", :indices => [7, 13]},
79
+ {:screen_name => "bob", :indices => [14, 18]}]
80
+ end
81
+ end
82
+
83
+ context "screen names embedded in text" do
84
+ it "should be linked in Latin text with the correct indices" do
85
+ @extractor.extract_mentioned_screen_names_with_indices("waiting for @alice to arrive").should == [{:screen_name => "alice", :indices => [12, 18]}]
86
+ end
87
+
88
+ it "should be linked in Japanese text with the correct indices" do
89
+ @extractor.extract_mentioned_screen_names_with_indices("の@aliceに到着を待っている").should == [{:screen_name => "alice", :indices => [1, 7]}]
90
+ end
91
+ end
92
+
93
+ it "should accept a block arugment and call it in order" do
94
+ needed = [{:screen_name => "alice", :indices => [0, 6]}, {:screen_name => "bob", :indices => [7, 11]}]
95
+ @extractor.extract_mentioned_screen_names_with_indices("@alice @bob") do |sn, start_index, end_index|
96
+ data = needed.shift
97
+ sn.should == data[:screen_name]
98
+ start_index.should == data[:indices].first
99
+ end_index.should == data[:indices].last
100
+ end
101
+ needed.should == []
102
+ end
103
+ end
104
+
52
105
  describe "replies" do
53
106
  context "should be extracted from" do
54
107
  it "should extract from lone name" do
@@ -118,6 +171,37 @@ describe Twitter::Extractor do
118
171
  end
119
172
  end
120
173
 
174
+ describe "urls with indices" do
175
+ describe "matching URLS" do
176
+ TestUrls::VALID.each do |url|
177
+ it "should extract the URL #{url} and prefix it with a protocol if missing" do
178
+ extracted_urls = @extractor.extract_urls_with_indices(url)
179
+ extracted_urls.size.should == 1
180
+ extracted_url = extracted_urls.first
181
+ extracted_url[:url].should include(url)
182
+ extracted_url[:indices].first.should == 0
183
+ extracted_url[:indices].last.should == url.chars.to_a.size
184
+ end
185
+
186
+ it "should match the URL #{url} when it's embedded in other text" do
187
+ text = "Sweet url: #{url} I found. #awesome"
188
+ extracted_urls = @extractor.extract_urls_with_indices(text)
189
+ extracted_urls.size.should == 1
190
+ extracted_url = extracted_urls.first
191
+ extracted_url[:url].should include(url)
192
+ extracted_url[:indices].first.should == 11
193
+ extracted_url[:indices].last.should == 11 + url.chars.to_a.size
194
+ end
195
+ end
196
+ end
197
+
198
+ describe "invalid URLS" do
199
+ it "does not link urls with invalid domains" do
200
+ @extractor.extract_urls_with_indices("http://tld-too-short.x").should == []
201
+ end
202
+ end
203
+ end
204
+
121
205
  describe "hashtags" do
122
206
  context "extracts latin/numeric hashtags" do
123
207
  %w(text text123 123text).each do |hashtag|
@@ -163,7 +247,6 @@ describe Twitter::Extractor do
163
247
  end
164
248
  end
165
249
  end
166
-
167
250
  end
168
251
 
169
252
  it "should not extract numeric hashtags" do
@@ -171,4 +254,69 @@ describe Twitter::Extractor do
171
254
  end
172
255
  end
173
256
 
257
+ describe "hashtags with indices" do
258
+ def match_hashtag_in_text(hashtag, text, offset = 0)
259
+ extracted_hashtags = @extractor.extract_hashtags_with_indices(text)
260
+ extracted_hashtags.size.should == 1
261
+ extracted_hashtag = extracted_hashtags.first
262
+ extracted_hashtag[:hashtag].should == hashtag
263
+ extracted_hashtag[:indices].first.should == offset
264
+ extracted_hashtag[:indices].last.should == offset + hashtag.chars.to_a.size + 1
265
+ end
266
+
267
+ def no_match_hashtag_in_text(text)
268
+ extracted_hashtags = @extractor.extract_hashtags_with_indices(text)
269
+ extracted_hashtags.size.should == 0
270
+ end
271
+
272
+ context "extracts latin/numeric hashtags" do
273
+ %w(text text123 123text).each do |hashtag|
274
+ it "should extract ##{hashtag}" do
275
+ match_hashtag_in_text(hashtag, "##{hashtag}")
276
+ end
277
+
278
+ it "should extract ##{hashtag} within text" do
279
+ match_hashtag_in_text(hashtag, "pre-text ##{hashtag} post-text", 9)
280
+ end
281
+ end
282
+ end
283
+
284
+ context "international hashtags" do
285
+ context "should allow accents" do
286
+ %w(mañana café münchen).each do |hashtag|
287
+ it "should extract ##{hashtag}" do
288
+ match_hashtag_in_text(hashtag, "##{hashtag}")
289
+ end
290
+
291
+ it "should extract ##{hashtag} within text" do
292
+ match_hashtag_in_text(hashtag, "pre-text ##{hashtag} post-text", 9)
293
+ end
294
+ end
295
+
296
+ it "should not allow the multiplication character" do
297
+ match_hashtag_in_text('pre', "#pre#{[0xd7].pack('U')}post")
298
+ end
299
+
300
+ it "should not allow the division character" do
301
+ match_hashtag_in_text('pre', "#pre#{[0xf7].pack('U')}post")
302
+ end
303
+ end
304
+
305
+ context "should NOT allow Japanese" do
306
+ %w(会議中 ハッシュ).each do |hashtag|
307
+ it "should NOT extract ##{hashtag}" do
308
+ no_match_hashtag_in_text("##{hashtag}")
309
+ end
310
+
311
+ it "should NOT extract ##{hashtag} within text" do
312
+ no_match_hashtag_in_text("pre-text ##{hashtag} post-text")
313
+ end
314
+ end
315
+ end
316
+ end
317
+
318
+ it "should not extract numeric hashtags" do
319
+ no_match_hashtag_in_text("#1234")
320
+ end
321
+ end
174
322
  end
@@ -1,3 +1,4 @@
1
+ #encoding: UTF-8
1
2
  require File.dirname(__FILE__) + '/spec_helper'
2
3
 
3
4
  class TestHitHighlighter
data/spec/regex_spec.rb CHANGED
@@ -1,3 +1,4 @@
1
+ #encoding: UTF-8
1
2
  require File.dirname(__FILE__) + '/spec_helper'
2
3
 
3
4
  describe "Twitter::Regex regular expressions" do
data/spec/test_urls.rb CHANGED
@@ -1,3 +1,4 @@
1
+ #encoding: UTF-8
1
2
  module TestUrls
2
3
  VALID = [
3
4
  "http://google.com",
@@ -20,6 +21,8 @@ module TestUrls
20
21
  "http://search.twitter.com/search?q=avro&lang=en",
21
22
  "http://mrs.domain-dash.biz",
22
23
  "http://x.com/has/one/char/domain",
24
+ "http://t.co/nwcLTFF",
25
+ # "t.co/nwcLTFF"
23
26
  ]
24
27
 
25
28
  INVALID = [
data/spec/unicode_spec.rb CHANGED
@@ -1,3 +1,4 @@
1
+ #encoding: UTF-8
1
2
  require File.dirname(__FILE__) + '/spec_helper'
2
3
 
3
4
  describe Twitter::Unicode do
@@ -1,3 +1,4 @@
1
+ #encoding: BINARY
1
2
  require File.dirname(__FILE__) + '/spec_helper'
2
3
 
3
4
  class TestValidation
metadata CHANGED
@@ -1,36 +1,45 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitter-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.2
4
+ prerelease: false
5
+ segments:
6
+ - 1
7
+ - 1
8
+ - 4
9
+ version: 1.1.4
5
10
  platform: ruby
6
11
  authors:
7
12
  - Matt Sanford
8
13
  - Patrick Ewing
9
14
  - Ben Cherry
10
15
  - Britt Selvitelle
16
+ - Raffi Krikorian
11
17
  autorequire: ""
12
18
  bindir: bin
13
19
  cert_chain: []
14
20
 
15
- date: 2010-05-11 00:00:00 -07:00
21
+ date: 2010-07-08 00:00:00 -07:00
16
22
  default_executable:
17
23
  dependencies:
18
24
  - !ruby/object:Gem::Dependency
19
25
  name: actionpack
20
- type: :runtime
21
- version_requirement:
22
- version_requirements: !ruby/object:Gem::Requirement
26
+ prerelease: false
27
+ requirement: &id001 !ruby/object:Gem::Requirement
23
28
  requirements:
24
29
  - - ">="
25
30
  - !ruby/object:Gem::Version
31
+ segments:
32
+ - 0
26
33
  version: "0"
27
- version:
34
+ type: :runtime
35
+ version_requirements: *id001
28
36
  description: A gem that provides text handling for Twitter
29
37
  email:
30
38
  - matt@twitter.com
31
39
  - patrick.henry.ewing@gmail.com
32
40
  - bcherry@gmail.com
33
41
  - bs@brittspace.com
42
+ - raffi@twitter.com
34
43
  executables: []
35
44
 
36
45
  extensions: []
@@ -70,18 +79,20 @@ required_ruby_version: !ruby/object:Gem::Requirement
70
79
  requirements:
71
80
  - - ">="
72
81
  - !ruby/object:Gem::Version
82
+ segments:
83
+ - 0
73
84
  version: "0"
74
- version:
75
85
  required_rubygems_version: !ruby/object:Gem::Requirement
76
86
  requirements:
77
87
  - - ">="
78
88
  - !ruby/object:Gem::Version
89
+ segments:
90
+ - 0
79
91
  version: "0"
80
- version:
81
92
  requirements: []
82
93
 
83
94
  rubyforge_project:
84
- rubygems_version: 1.3.5
95
+ rubygems_version: 1.3.6
85
96
  signing_key:
86
97
  specification_version: 3
87
98
  summary: Twitter text handling library