twitter-text 1.1.2 → 1.1.4

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -26,7 +26,7 @@ usernames, lists, hashtags and URLs.
26
26
  # Auto-link
27
27
  class MyClass
28
28
  include Twitter::Autolink
29
-
29
+
30
30
  html = auto_link("link @user, please #request")
31
31
  end
32
32
 
@@ -69,3 +69,18 @@ To run the Conformance suite, you'll need to add that project as a git submodule
69
69
  git submodule add git@github.com:mzsanford/twitter-text-conformance.git test/twitter-text-conformance/
70
70
  git submodule init
71
71
  git submodule update
72
+
73
+ === Thanks
74
+
75
+ Thanks to everybody who has filed issues, provided feedback or contributed patches. Patches courtesy of:
76
+
77
+ * At Twitter …
78
+ * Raffi Krikorian - http://github.com/r
79
+ * Ben Cherry - http://github.com/bcherry
80
+ * Patrick Ewing - http://github.com/hoverbird
81
+ * Jeff Smick - http://github.com/sprsquish
82
+
83
+ * Patches from the community …
84
+ * Jean-Philippe Bougie - http://github.com/jpbougie
85
+
86
+ * Anyone who has filed an issue. It helps. Really.
data/Rakefile CHANGED
@@ -9,9 +9,9 @@ require 'digest'
9
9
 
10
10
  spec = Gem::Specification.new do |s|
11
11
  s.name = "twitter-text"
12
- s.version = "1.1.2"
13
- s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle"]
14
- s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com"]
12
+ s.version = "1.1.4"
13
+ s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle", "Raffi Krikorian"]
14
+ s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com", "raffi@twitter.com"]
15
15
  s.homepage = "http://twitter.com"
16
16
  s.description = s.summary = "A gem that provides text handling for Twitter"
17
17
 
@@ -32,6 +32,7 @@ desc "Run specs"
32
32
  Spec::Rake::SpecTask.new do |t|
33
33
  t.spec_files = FileList['spec/**/*_spec.rb']
34
34
  t.spec_opts = %w(-fs --color)
35
+ t.libs << ["spec", '.']
35
36
  end
36
37
 
37
38
  desc "Run all examples with RCov"
@@ -41,15 +42,12 @@ Spec::Rake::SpecTask.new('spec:rcov') do |t|
41
42
  t.rcov_opts = ['--exclude', 'spec']
42
43
  end
43
44
 
44
-
45
45
  def conformance_version(dir)
46
46
  Dir[File.join(dir, '*')].inject(Digest::SHA1.new){|digest, file| digest.update(Digest::SHA1.file(file).hexdigest) }
47
47
  end
48
48
 
49
49
  namespace :test do
50
50
  namespace :conformance do
51
-
52
-
53
51
  desc "Update conformance testing data"
54
52
  task :update do
55
53
  puts "Updating conformance data ... "
@@ -85,8 +83,7 @@ namespace :test do
85
83
  end
86
84
  end
87
85
 
88
-
89
- namespace :doc do
86
+ namespace :doc do
90
87
  Rake::RDocTask.new do |rd|
91
88
  rd.main = "README.rdoc"
92
89
  rd.rdoc_dir = 'doc'
data/lib/extractor.rb CHANGED
@@ -1,3 +1,41 @@
1
+ class String
2
+ # Helper function to count the character length by first converting to an
3
+ # array. This is needed because with unicode strings, the return value
4
+ # of length may be incorrect
5
+ def char_length
6
+ if respond_to? :codepoints
7
+ length
8
+ else
9
+ chars.kind_of?(Enumerable) ? chars.to_a.size : chars.size
10
+ end
11
+ end
12
+
13
+ # Helper function to convert this string into an array of unicode characters.
14
+ def to_char_a
15
+ @to_char_a ||= if chars.kind_of?(Enumerable)
16
+ chars.to_a
17
+ else
18
+ char_array = []
19
+ 0.upto(char_length - 1) { |i| char_array << [chars.slice(i)].pack('U') }
20
+ char_array
21
+ end
22
+ end
23
+
24
+ # Helper function to find the index of the <tt>sub_string</tt> in
25
+ # <tt>str</tt>. This is needed because with unicode strings, the return
26
+ # of index may be incorrect.
27
+ def sub_string_search(sub_str, position = 0)
28
+ if respond_to? :codepoints
29
+ index(sub_str, position)
30
+ else
31
+ index = to_char_a[position..-1].each_with_index.find do |e|
32
+ to_char_a.slice(e.last + position, sub_str.char_length).map{|ci| ci.first }.join == sub_str
33
+ end
34
+ index.nil? ? -1 : index.last + position
35
+ end
36
+ end
37
+ end
38
+
1
39
  module Twitter
2
40
  # A module for including Tweet parsing in a class. This module provides functions for the extraction and processing
3
41
  # of usernames, lists, URLs and hashtags.
@@ -9,13 +47,38 @@ module Twitter
9
47
  #
10
48
  # If a block is given then it will be called for each username.
11
49
  def extract_mentioned_screen_names(text) # :yields: username
50
+ screen_names_only = extract_mentioned_screen_names_with_indices(text).map{|mention| mention[:screen_name] }
51
+ screen_names_only.each{|mention| yield mention } if block_given?
52
+ screen_names_only
53
+ end
54
+
55
+ # Extracts a list of all usernames mentioned in the Tweet <tt>text</tt>
56
+ # along with the indices for where the mention occurred. If the
57
+ # <tt>text</tt> is nil or contains no username mentions, an empty array
58
+ # will be returned.
59
+ #
60
+ # If a block is given, then it will be called with each username, the start
61
+ # index, and the end index in the <tt>text</tt>.
62
+ def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
12
63
  return [] unless text
13
64
 
14
65
  possible_screen_names = []
15
- text.scan(Twitter::Regex[:extract_mentions]) do |before, sn, after|
16
- possible_screen_names << sn unless after =~ Twitter::Regex[:at_signs]
66
+ position = 0
67
+ text.to_s.scan(Twitter::Regex[:extract_mentions]) do |before, sn, after|
68
+ unless after =~ Twitter::Regex[:at_signs]
69
+ start_position = text.to_s.sub_string_search(sn, position) - 1
70
+ position = start_position + sn.char_length + 1
71
+ possible_screen_names << {
72
+ :screen_name => sn,
73
+ :indices => [start_position, position]
74
+ }
75
+ end
76
+ end
77
+ if block_given?
78
+ possible_screen_names.each do |mention|
79
+ yield mention[:screen_name], mention[:indices].first, mention[:indices].last
80
+ end
17
81
  end
18
- possible_screen_names.each{|sn| yield sn } if block_given?
19
82
  possible_screen_names
20
83
  end
21
84
 
@@ -39,12 +102,30 @@ module Twitter
39
102
  #
40
103
  # If a block is given then it will be called for each URL.
41
104
  def extract_urls(text) # :yields: url
105
+ urls_only = extract_urls_with_indices(text).map{|url| url[:url] }
106
+ urls_only.each{|url| yield url } if block_given?
107
+ urls_only
108
+ end
109
+
110
+ # Extracts a list of all URLs included in the Tweet <tt>text</tt> along
111
+ # with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
112
+ # URLs an empty array will be returned.
113
+ #
114
+ # If a block is given then it will be called with each URL, its start index, and its end index.
115
+ def extract_urls_with_indices(text) # :yields: url, start, end
42
116
  return [] unless text
43
117
  urls = []
118
+ position = 0
44
119
  text.to_s.scan(Twitter::Regex[:valid_url]) do |all, before, url, protocol, domain, path, query|
45
- urls << (protocol == "www." ? "http://#{url}" : url)
120
+ start_position = text.to_s.sub_string_search(url, position)
121
+ end_position = start_position + url.char_length
122
+ position = end_position
123
+ urls << {
124
+ :url => (protocol == "www." ? "http://#{url}" : url),
125
+ :indices => [start_position, end_position]
126
+ }
46
127
  end
47
- urls.each{|url| yield url } if block_given?
128
+ urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last } if block_given?
48
129
  urls
49
130
  end
50
131
 
@@ -55,15 +136,32 @@ module Twitter
55
136
  #
56
137
  # If a block is given then it will be called for each hashtag.
57
138
  def extract_hashtags(text) # :yields: hashtag_text
139
+ hashtags_only = extract_hashtags_with_indices(text).map{|hash| hash[:hashtag] }
140
+ hashtags_only.each{|hash| yield hash } if block_given?
141
+ hashtags_only
142
+ end
143
+
144
+ # Extracts a list of all hashtags included in the Tweet <tt>text</tt>. If the
145
+ # <tt>text</tt> is <tt>nil</tt> or contains no hashtags an empty array
146
+ # will be returned. The array returned will not include the leading <tt>#</tt>
147
+ # character.
148
+ #
149
+ # If a block is given then it will be called with each hashtag, its start index, and its end index.
150
+ def extract_hashtags_with_indices(text) # :yields: hashtag_text, start, end
58
151
  return [] unless text
59
152
 
60
153
  tags = []
154
+ position = 0
61
155
  text.scan(Twitter::Regex[:auto_link_hashtags]) do |before, hash, hash_text|
62
- tags << hash_text
156
+ start_position = text.to_s.sub_string_search(hash, position)
157
+ position = start_position + hash_text.char_length + 1
158
+ tags << {
159
+ :hashtag => hash_text,
160
+ :indices => [start_position, position]
161
+ }
63
162
  end
64
- tags.each{|tag| yield tag } if block_given?
163
+ tags.each{|tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given?
65
164
  tags
66
165
  end
67
-
68
166
  end
69
167
  end
@@ -65,7 +65,7 @@ module Twitter
65
65
  end
66
66
  placed = true
67
67
  end
68
-
68
+
69
69
  # ultimate fallback, hits that run off the end get a closing tag
70
70
  if !placed
71
71
  result << tag
@@ -82,6 +82,8 @@ module Twitter
82
82
  end
83
83
 
84
84
  result
85
+ rescue
86
+ text
85
87
  end
86
88
  end
87
89
  end
data/lib/regex.rb CHANGED
@@ -1,3 +1,4 @@
1
+ # coding: UTF-8
1
2
 
2
3
  module Twitter
3
4
  # A collection of regular expressions for parsing Tweet text. The regular expression
@@ -30,7 +31,7 @@ module Twitter
30
31
  REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(?=(.|$))/o
31
32
  REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
32
33
 
33
- REGEXEN[:list_name] = /^[a-zA-Z\x80-\xff].{0,79}$/
34
+ REGEXEN[:list_name] = /^[a-zA-Z\u0080-\u00ff].{0,79}$/
34
35
 
35
36
  # Latin accented characters (subtracted 0xD7 from the range, it's a confusable multiplication sign. Looks like "x")
36
37
  LATIN_ACCENTS = [(0xc0..0xd6).to_a, (0xd8..0xf6).to_a, (0xf8..0xff).to_a].flatten.pack('U*').freeze
@@ -39,17 +40,24 @@ module Twitter
39
40
  # Characters considered valid in a hashtag but not at the beginning, where only a-z and 0-9 are valid.
40
41
  HASHTAG_CHARACTERS = /[a-z0-9_#{LATIN_ACCENTS}]/io
41
42
  REGEXEN[:auto_link_hashtags] = /(^|[^0-9A-Z&\/]+)(#|#)([0-9A-Z_]*[A-Z_]+#{HASHTAG_CHARACTERS}*)/io
42
- REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9\x80-\xff\-]{0,79})?/
43
+ REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9\u0080-\u00ff\-]{0,79})?/
43
44
  REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\&lt;\|:~\(|\}:o\{|:\-\[|\&gt;o\&lt;|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
44
45
 
45
46
  # URL related hash regex collection
46
47
  REGEXEN[:valid_preceding_chars] = /(?:[^\/"':!=]|^|\:)/
47
48
  REGEXEN[:valid_domain] = /(?:[^[:punct:]\s][\.-](?=[^[:punct:]\s])|[^[:punct:]\s]){1,}\.[a-z]{2,}(?::[0-9]+)?/i
48
- REGEXEN[:valid_url_path_chars] = /[\.\,]?[a-z0-9!\*'\(\);:=\+\$\/%#\[\]\-_,~@]/i
49
+
50
+ # Allow URL paths to contain balanced parens
51
+ # 1. Used in Wikipedia URLs like /Primer_(film)
52
+ # 2. Used in IIS sessions like /S(dfd346)/
53
+ REGEXEN[:wikipedia_disambiguation] = /(?:\([^\)]+\))/i
54
+ REGEXEN[:valid_url_path_chars] = /(?:
55
+ #{REGEXEN[:wikipedia_disambiguation]}|
56
+ [\.\,]?[a-z0-9!\*';:=\+\$\/%#\[\]\-_,~@]
57
+ )/ix
49
58
  # Valid end-of-path characters (so /foo. does not gobble the period).
50
- # 1. Allow ) for Wikipedia URLs.
51
- # 2. Allow =&# for empty URL parameters and other URL-join artifacts
52
- REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9\)=#\/]/i
59
+ # 1. Allow =&# for empty URL parameters and other URL-join artifacts
60
+ REGEXEN[:valid_url_path_ending_chars] = /[a-z0-9=#\/]/i
53
61
  REGEXEN[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~]/i
54
62
  REGEXEN[:valid_url_query_ending_chars] = /[a-z0-9_&=#]/i
55
63
  REGEXEN[:valid_url] = %r{
@@ -58,7 +66,9 @@ module Twitter
58
66
  ( # $3 URL
59
67
  (https?:\/\/|www\.) # $4 Protocol or beginning
60
68
  (#{REGEXEN[:valid_domain]}) # $5 Domain(s) and optional post number
61
- (/#{REGEXEN[:valid_url_path_chars]}*#{REGEXEN[:valid_url_path_ending_chars]}?)? # $6 URL Path
69
+ (/#{REGEXEN[:valid_url_path_chars]}*
70
+ #{REGEXEN[:valid_url_path_ending_chars]}?
71
+ )? # $6 URL Path
62
72
  (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $7 Query String
63
73
  )
64
74
  )
data/lib/twitter-text.rb CHANGED
@@ -1,4 +1,4 @@
1
- raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless ['u','UTF8'].include?($KCODE)
1
+ raise("twitter-text requires the $KCODE variable be set to 'UTF8' or 'u'") unless ['u','UTF8'].include?($KCODE) || ''.respond_to?(:codepoints)
2
2
 
3
3
  require 'rubygems'
4
4
 
@@ -1,4 +1,6 @@
1
- require File.dirname(__FILE__) + '/spec_helper'
1
+ #encoding: UTF-8
2
+ # require File.dirname(__FILE__) + '/spec_helper'
3
+ require 'spec_helper'
2
4
 
3
5
  class TestAutolink
4
6
  include Twitter::Autolink
@@ -303,7 +305,7 @@ describe Twitter::Autolink do
303
305
 
304
306
  it "should be linked" do
305
307
  link = Hpricot(@autolinked_text).at('a')
306
- link.inner_text.should == "#{[0xFF03].pack('U')}twj_dev"
308
+ (link.inner_text.respond_to?(:force_encoding) ? link.inner_text.force_encoding("utf-8") : link.inner_text).should == "#{[0xFF03].pack('U')}twj_dev"
307
309
  link['href'].should == 'http://twitter.com/search?q=%23twj_dev'
308
310
  end
309
311
  end
@@ -329,7 +331,7 @@ describe Twitter::Autolink do
329
331
  end
330
332
  end
331
333
 
332
- context "when surrounded by parentheses;" do
334
+ context "with a path surrounded by parentheses;" do
333
335
  def original_text; "I found a neatness (#{url})"; end
334
336
 
335
337
  it "should be linked" do
@@ -340,10 +342,49 @@ describe Twitter::Autolink do
340
342
  def url; "http://www.google.com/"; end
341
343
 
342
344
  it "should be linked" do
343
- pending # our support for Wikipedia URLS containing parens breaks this corner case
344
345
  @autolinked_text.should have_autolinked_url(url)
345
346
  end
346
347
  end
348
+
349
+ context "when the URL has a path;" do
350
+ def url; "http://www.google.com/fsdfasdf"; end
351
+
352
+ it "should be linked" do
353
+ @autolinked_text.should have_autolinked_url(url)
354
+ end
355
+ end
356
+ end
357
+
358
+ context "when path contains parens" do
359
+ def original_text; "I found a neatness (#{url})"; end
360
+
361
+ it "should be linked" do
362
+ @autolinked_text.should have_autolinked_url(url)
363
+ end
364
+
365
+ context "wikipedia" do
366
+ def url; "http://en.wikipedia.org/wiki/Madonna_(artist)"; end
367
+
368
+ it "should be linked" do
369
+ @autolinked_text.should have_autolinked_url(url)
370
+ end
371
+ end
372
+
373
+ context "IIS session" do
374
+ def url; "http://msdn.com/S(deadbeef)/page.htm"; end
375
+
376
+ it "should be linked" do
377
+ @autolinked_text.should have_autolinked_url(url)
378
+ end
379
+ end
380
+
381
+ context "unbalanced parens" do
382
+ def url; "http://example.com/i_has_a_("; end
383
+
384
+ it "should be linked" do
385
+ @autolinked_text.should have_autolinked_url("http://example.com/i_has_a_")
386
+ end
387
+ end
347
388
  end
348
389
 
349
390
  context "when preceded by a :" do
@@ -1,3 +1,4 @@
1
+ #encoding: UTF-8
1
2
  require File.dirname(__FILE__) + '/spec_helper'
2
3
 
3
4
  class TestExtractor
@@ -49,6 +50,58 @@ describe Twitter::Extractor do
49
50
  end
50
51
  end
51
52
 
53
+ describe "mentions with indices" do
54
+ context "single screen name alone " do
55
+ it "should be linked and the correct indices" do
56
+ @extractor.extract_mentioned_screen_names_with_indices("@alice").should == [{:screen_name => "alice", :indices => [0, 6]}]
57
+ end
58
+
59
+ it "should be linked with _ and the correct indices" do
60
+ @extractor.extract_mentioned_screen_names_with_indices("@alice_adams").should == [{:screen_name => "alice_adams", :indices => [0, 12]}]
61
+ end
62
+
63
+ it "should be linked if numeric and the correct indices" do
64
+ @extractor.extract_mentioned_screen_names_with_indices("@1234").should == [{:screen_name => "1234", :indices => [0, 5]}]
65
+ end
66
+ end
67
+
68
+ context "multiple screen names" do
69
+ it "should both be linked with the correct indices" do
70
+ @extractor.extract_mentioned_screen_names_with_indices("@alice @bob").should ==
71
+ [{:screen_name => "alice", :indices => [0, 6]},
72
+ {:screen_name => "bob", :indices => [7, 11]}]
73
+ end
74
+
75
+ it "should be linked with the correct indices even when repeated" do
76
+ @extractor.extract_mentioned_screen_names_with_indices("@alice @alice @bob").should ==
77
+ [{:screen_name => "alice", :indices => [0, 6]},
78
+ {:screen_name => "alice", :indices => [7, 13]},
79
+ {:screen_name => "bob", :indices => [14, 18]}]
80
+ end
81
+ end
82
+
83
+ context "screen names embedded in text" do
84
+ it "should be linked in Latin text with the correct indices" do
85
+ @extractor.extract_mentioned_screen_names_with_indices("waiting for @alice to arrive").should == [{:screen_name => "alice", :indices => [12, 18]}]
86
+ end
87
+
88
+ it "should be linked in Japanese text with the correct indices" do
89
+ @extractor.extract_mentioned_screen_names_with_indices("の@aliceに到着を待っている").should == [{:screen_name => "alice", :indices => [1, 7]}]
90
+ end
91
+ end
92
+
93
+ it "should accept a block arugment and call it in order" do
94
+ needed = [{:screen_name => "alice", :indices => [0, 6]}, {:screen_name => "bob", :indices => [7, 11]}]
95
+ @extractor.extract_mentioned_screen_names_with_indices("@alice @bob") do |sn, start_index, end_index|
96
+ data = needed.shift
97
+ sn.should == data[:screen_name]
98
+ start_index.should == data[:indices].first
99
+ end_index.should == data[:indices].last
100
+ end
101
+ needed.should == []
102
+ end
103
+ end
104
+
52
105
  describe "replies" do
53
106
  context "should be extracted from" do
54
107
  it "should extract from lone name" do
@@ -118,6 +171,37 @@ describe Twitter::Extractor do
118
171
  end
119
172
  end
120
173
 
174
+ describe "urls with indices" do
175
+ describe "matching URLS" do
176
+ TestUrls::VALID.each do |url|
177
+ it "should extract the URL #{url} and prefix it with a protocol if missing" do
178
+ extracted_urls = @extractor.extract_urls_with_indices(url)
179
+ extracted_urls.size.should == 1
180
+ extracted_url = extracted_urls.first
181
+ extracted_url[:url].should include(url)
182
+ extracted_url[:indices].first.should == 0
183
+ extracted_url[:indices].last.should == url.chars.to_a.size
184
+ end
185
+
186
+ it "should match the URL #{url} when it's embedded in other text" do
187
+ text = "Sweet url: #{url} I found. #awesome"
188
+ extracted_urls = @extractor.extract_urls_with_indices(text)
189
+ extracted_urls.size.should == 1
190
+ extracted_url = extracted_urls.first
191
+ extracted_url[:url].should include(url)
192
+ extracted_url[:indices].first.should == 11
193
+ extracted_url[:indices].last.should == 11 + url.chars.to_a.size
194
+ end
195
+ end
196
+ end
197
+
198
+ describe "invalid URLS" do
199
+ it "does not link urls with invalid domains" do
200
+ @extractor.extract_urls_with_indices("http://tld-too-short.x").should == []
201
+ end
202
+ end
203
+ end
204
+
121
205
  describe "hashtags" do
122
206
  context "extracts latin/numeric hashtags" do
123
207
  %w(text text123 123text).each do |hashtag|
@@ -163,7 +247,6 @@ describe Twitter::Extractor do
163
247
  end
164
248
  end
165
249
  end
166
-
167
250
  end
168
251
 
169
252
  it "should not extract numeric hashtags" do
@@ -171,4 +254,69 @@ describe Twitter::Extractor do
171
254
  end
172
255
  end
173
256
 
257
+ describe "hashtags with indices" do
258
+ def match_hashtag_in_text(hashtag, text, offset = 0)
259
+ extracted_hashtags = @extractor.extract_hashtags_with_indices(text)
260
+ extracted_hashtags.size.should == 1
261
+ extracted_hashtag = extracted_hashtags.first
262
+ extracted_hashtag[:hashtag].should == hashtag
263
+ extracted_hashtag[:indices].first.should == offset
264
+ extracted_hashtag[:indices].last.should == offset + hashtag.chars.to_a.size + 1
265
+ end
266
+
267
+ def no_match_hashtag_in_text(text)
268
+ extracted_hashtags = @extractor.extract_hashtags_with_indices(text)
269
+ extracted_hashtags.size.should == 0
270
+ end
271
+
272
+ context "extracts latin/numeric hashtags" do
273
+ %w(text text123 123text).each do |hashtag|
274
+ it "should extract ##{hashtag}" do
275
+ match_hashtag_in_text(hashtag, "##{hashtag}")
276
+ end
277
+
278
+ it "should extract ##{hashtag} within text" do
279
+ match_hashtag_in_text(hashtag, "pre-text ##{hashtag} post-text", 9)
280
+ end
281
+ end
282
+ end
283
+
284
+ context "international hashtags" do
285
+ context "should allow accents" do
286
+ %w(mañana café münchen).each do |hashtag|
287
+ it "should extract ##{hashtag}" do
288
+ match_hashtag_in_text(hashtag, "##{hashtag}")
289
+ end
290
+
291
+ it "should extract ##{hashtag} within text" do
292
+ match_hashtag_in_text(hashtag, "pre-text ##{hashtag} post-text", 9)
293
+ end
294
+ end
295
+
296
+ it "should not allow the multiplication character" do
297
+ match_hashtag_in_text('pre', "#pre#{[0xd7].pack('U')}post")
298
+ end
299
+
300
+ it "should not allow the division character" do
301
+ match_hashtag_in_text('pre', "#pre#{[0xf7].pack('U')}post")
302
+ end
303
+ end
304
+
305
+ context "should NOT allow Japanese" do
306
+ %w(会議中 ハッシュ).each do |hashtag|
307
+ it "should NOT extract ##{hashtag}" do
308
+ no_match_hashtag_in_text("##{hashtag}")
309
+ end
310
+
311
+ it "should NOT extract ##{hashtag} within text" do
312
+ no_match_hashtag_in_text("pre-text ##{hashtag} post-text")
313
+ end
314
+ end
315
+ end
316
+ end
317
+
318
+ it "should not extract numeric hashtags" do
319
+ no_match_hashtag_in_text("#1234")
320
+ end
321
+ end
174
322
  end
@@ -1,3 +1,4 @@
1
+ #encoding: UTF-8
1
2
  require File.dirname(__FILE__) + '/spec_helper'
2
3
 
3
4
  class TestHitHighlighter
data/spec/regex_spec.rb CHANGED
@@ -1,3 +1,4 @@
1
+ #encoding: UTF-8
1
2
  require File.dirname(__FILE__) + '/spec_helper'
2
3
 
3
4
  describe "Twitter::Regex regular expressions" do
data/spec/test_urls.rb CHANGED
@@ -1,3 +1,4 @@
1
+ #encoding: UTF-8
1
2
  module TestUrls
2
3
  VALID = [
3
4
  "http://google.com",
@@ -20,6 +21,8 @@ module TestUrls
20
21
  "http://search.twitter.com/search?q=avro&lang=en",
21
22
  "http://mrs.domain-dash.biz",
22
23
  "http://x.com/has/one/char/domain",
24
+ "http://t.co/nwcLTFF",
25
+ # "t.co/nwcLTFF"
23
26
  ]
24
27
 
25
28
  INVALID = [
data/spec/unicode_spec.rb CHANGED
@@ -1,3 +1,4 @@
1
+ #encoding: UTF-8
1
2
  require File.dirname(__FILE__) + '/spec_helper'
2
3
 
3
4
  describe Twitter::Unicode do
@@ -1,3 +1,4 @@
1
+ #encoding: BINARY
1
2
  require File.dirname(__FILE__) + '/spec_helper'
2
3
 
3
4
  class TestValidation
metadata CHANGED
@@ -1,36 +1,45 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twitter-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.2
4
+ prerelease: false
5
+ segments:
6
+ - 1
7
+ - 1
8
+ - 4
9
+ version: 1.1.4
5
10
  platform: ruby
6
11
  authors:
7
12
  - Matt Sanford
8
13
  - Patrick Ewing
9
14
  - Ben Cherry
10
15
  - Britt Selvitelle
16
+ - Raffi Krikorian
11
17
  autorequire: ""
12
18
  bindir: bin
13
19
  cert_chain: []
14
20
 
15
- date: 2010-05-11 00:00:00 -07:00
21
+ date: 2010-07-08 00:00:00 -07:00
16
22
  default_executable:
17
23
  dependencies:
18
24
  - !ruby/object:Gem::Dependency
19
25
  name: actionpack
20
- type: :runtime
21
- version_requirement:
22
- version_requirements: !ruby/object:Gem::Requirement
26
+ prerelease: false
27
+ requirement: &id001 !ruby/object:Gem::Requirement
23
28
  requirements:
24
29
  - - ">="
25
30
  - !ruby/object:Gem::Version
31
+ segments:
32
+ - 0
26
33
  version: "0"
27
- version:
34
+ type: :runtime
35
+ version_requirements: *id001
28
36
  description: A gem that provides text handling for Twitter
29
37
  email:
30
38
  - matt@twitter.com
31
39
  - patrick.henry.ewing@gmail.com
32
40
  - bcherry@gmail.com
33
41
  - bs@brittspace.com
42
+ - raffi@twitter.com
34
43
  executables: []
35
44
 
36
45
  extensions: []
@@ -70,18 +79,20 @@ required_ruby_version: !ruby/object:Gem::Requirement
70
79
  requirements:
71
80
  - - ">="
72
81
  - !ruby/object:Gem::Version
82
+ segments:
83
+ - 0
73
84
  version: "0"
74
- version:
75
85
  required_rubygems_version: !ruby/object:Gem::Requirement
76
86
  requirements:
77
87
  - - ">="
78
88
  - !ruby/object:Gem::Version
89
+ segments:
90
+ - 0
79
91
  version: "0"
80
- version:
81
92
  requirements: []
82
93
 
83
94
  rubyforge_project:
84
- rubygems_version: 1.3.5
95
+ rubygems_version: 1.3.6
85
96
  signing_key:
86
97
  specification_version: 3
87
98
  summary: Twitter text handling library