twitter-text 1.4.13 → 1.4.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/extractor.rb +5 -0
 - data/lib/regex.rb +7 -4
 - data/spec/extractor_spec.rb +46 -0
 - data/spec/test_urls.rb +24 -1
 - data/twitter-text.gemspec +1 -1
 - metadata +4 -4
 
    
        data/lib/extractor.rb
    CHANGED
    
    | 
         @@ -190,6 +190,11 @@ module Twitter 
     | 
|
| 
       190 
190 
     | 
    
         
             
                        last_url[:indices][1] = end_position
         
     | 
| 
       191 
191 
     | 
    
         
             
                      end
         
     | 
| 
       192 
192 
     | 
    
         
             
                    else
         
     | 
| 
      
 193 
     | 
    
         
            +
                      # In the case of t.co URLs, don't allow additional path characters
         
     | 
| 
      
 194 
     | 
    
         
            +
                      if url =~ Twitter::Regex[:valid_tco_url]
         
     | 
| 
      
 195 
     | 
    
         
            +
                        url = $&
         
     | 
| 
      
 196 
     | 
    
         
            +
                        end_position = start_position + url.char_length
         
     | 
| 
      
 197 
     | 
    
         
            +
                      end
         
     | 
| 
       193 
198 
     | 
    
         
             
                      urls << {
         
     | 
| 
       194 
199 
     | 
    
         
             
                        :url => url,
         
     | 
| 
       195 
200 
     | 
    
         
             
                        :indices => [start_position, end_position]
         
     | 
    
        data/lib/regex.rb
    CHANGED
    
    | 
         @@ -106,14 +106,14 @@ module Twitter 
     | 
|
| 
       106 
106 
     | 
    
         | 
| 
       107 
107 
     | 
    
         
             
                REGEXEN[:auto_link_hashtags] = /#{HASHTAG}/io
         
     | 
| 
       108 
108 
     | 
    
         
             
                # Used in Extractor and Rewriter for final filtering
         
     | 
| 
       109 
     | 
    
         
            -
                REGEXEN[:end_hashtag_match] =  
     | 
| 
      
 109 
     | 
    
         
            +
                REGEXEN[:end_hashtag_match] = /\A(?:[##]|:\/\/)/o
         
     | 
| 
       110 
110 
     | 
    
         | 
| 
       111 
111 
     | 
    
         
             
                REGEXEN[:at_signs] = /[@@]/
         
     | 
| 
       112 
112 
     | 
    
         
             
                REGEXEN[:extract_mentions] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
         
     | 
| 
       113 
113 
     | 
    
         
             
                REGEXEN[:extract_mentions_or_lists] = /(^|[^a-zA-Z0-9_])#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
         
     | 
| 
       114 
114 
     | 
    
         
             
                REGEXEN[:extract_reply] = /^(?:#{REGEXEN[:spaces]})*#{REGEXEN[:at_signs]}([a-zA-Z0-9_]{1,20})/o
         
     | 
| 
       115 
115 
     | 
    
         
             
                # Used in Extractor and Rewriter for final filtering
         
     | 
| 
       116 
     | 
    
         
            -
                REGEXEN[:end_screen_name_match] =  
     | 
| 
      
 116 
     | 
    
         
            +
                REGEXEN[:end_screen_name_match] = /\A(?:#{REGEXEN[:at_signs]}|#{REGEXEN[:latin_accents]}|:\/\/)/o
         
     | 
| 
       117 
117 
     | 
    
         | 
| 
       118 
118 
     | 
    
         
             
                REGEXEN[:auto_link_usernames_or_lists] = /([^a-zA-Z0-9_]|^|RT:?)([@@]+)([a-zA-Z0-9_]{1,20})(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?/o
         
     | 
| 
       119 
119 
     | 
    
         
             
                REGEXEN[:auto_link_emoticon] = /(8\-\#|8\-E|\+\-\(|\`\@|\`O|\<\|:~\(|\}:o\{|:\-\[|\>o\<|X\-\/|\[:-\]\-I\-|\/\/\/\/Ö\\\\\\\\|\(\|:\|\/\)|∑:\*\)|\( \| \))/
         
     | 
| 
         @@ -125,7 +125,7 @@ module Twitter 
     | 
|
| 
       125 
125 
     | 
    
         
             
                REGEXEN[:valid_subdomain] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[_-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
         
     | 
| 
       126 
126 
     | 
    
         
             
                REGEXEN[:valid_domain_name] = /(?:(?:#{DOMAIN_VALID_CHARS}(?:[-]|#{DOMAIN_VALID_CHARS})*)?#{DOMAIN_VALID_CHARS}\.)/io
         
     | 
| 
       127 
127 
     | 
    
         | 
| 
       128 
     | 
    
         
            -
                REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel)(?=[^a-z]|$))/i
         
     | 
| 
      
 128 
     | 
    
         
            +
                REGEXEN[:valid_gTLD] = /(?:(?:aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|xxx)(?=[^a-z]|$))/i
         
     | 
| 
       129 
129 
     | 
    
         
             
                REGEXEN[:valid_ccTLD] = %r{
         
     | 
| 
       130 
130 
     | 
    
         
             
                  (?:
         
     | 
| 
       131 
131 
     | 
    
         
             
                    (?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|
         
     | 
| 
         @@ -150,8 +150,11 @@ module Twitter 
     | 
|
| 
       150 
150 
     | 
    
         
             
                  (?:#{REGEXEN[:valid_gTLD]}|#{REGEXEN[:valid_ccTLD]}|#{REGEXEN[:valid_punycode]})
         
     | 
| 
       151 
151 
     | 
    
         
             
                /iox
         
     | 
| 
       152 
152 
     | 
    
         | 
| 
      
 153 
     | 
    
         
            +
                # This is used in Extractor for stricter t.co URL extraction
         
     | 
| 
      
 154 
     | 
    
         
            +
                REGEXEN[:valid_tco_url] = /^https?:\/\/t\.co\/[a-z0-9]+/i
         
     | 
| 
      
 155 
     | 
    
         
            +
             
     | 
| 
       153 
156 
     | 
    
         
             
                # This is used in Extractor to filter out unwanted URLs.
         
     | 
| 
       154 
     | 
    
         
            -
                REGEXEN[:invalid_short_domain] =  
     | 
| 
      
 157 
     | 
    
         
            +
                REGEXEN[:invalid_short_domain] = /\A#{REGEXEN[:valid_domain_name]}#{REGEXEN[:valid_ccTLD]}\Z/io
         
     | 
| 
       155 
158 
     | 
    
         | 
| 
       156 
159 
     | 
    
         
             
                REGEXEN[:valid_port_number] = /[0-9]+/
         
     | 
| 
       157 
160 
     | 
    
         | 
    
        data/spec/extractor_spec.rb
    CHANGED
    
    | 
         @@ -169,6 +169,27 @@ describe Twitter::Extractor do 
     | 
|
| 
       169 
169 
     | 
    
         
             
                    @extractor.extract_urls("http://tld-too-short.x").should == []
         
     | 
| 
       170 
170 
     | 
    
         
             
                  end
         
     | 
| 
       171 
171 
     | 
    
         
             
                end
         
     | 
| 
      
 172 
     | 
    
         
            +
             
     | 
| 
      
 173 
     | 
    
         
            +
                describe "t.co URLS" do
         
     | 
| 
      
 174 
     | 
    
         
            +
                  TestUrls::TCO.each do |url|
         
     | 
| 
      
 175 
     | 
    
         
            +
                    it "should only extract the t.co URL from the URL #{url}" do
         
     | 
| 
      
 176 
     | 
    
         
            +
                      extracted_urls = @extractor.extract_urls(url)
         
     | 
| 
      
 177 
     | 
    
         
            +
                      extracted_urls.size.should == 1
         
     | 
| 
      
 178 
     | 
    
         
            +
                      extracted_url = extracted_urls.first
         
     | 
| 
      
 179 
     | 
    
         
            +
                      extracted_url.should_not == url
         
     | 
| 
      
 180 
     | 
    
         
            +
                      extracted_url.should == url[0...20]
         
     | 
| 
      
 181 
     | 
    
         
            +
                    end
         
     | 
| 
      
 182 
     | 
    
         
            +
             
     | 
| 
      
 183 
     | 
    
         
            +
                    it "should match the t.co URL from the URL #{url} when it's embedded in other text" do
         
     | 
| 
      
 184 
     | 
    
         
            +
                      text = "Sweet url: #{url} I found. #awesome"
         
     | 
| 
      
 185 
     | 
    
         
            +
                      extracted_urls = @extractor.extract_urls(text)
         
     | 
| 
      
 186 
     | 
    
         
            +
                      extracted_urls.size.should == 1
         
     | 
| 
      
 187 
     | 
    
         
            +
                      extracted_url = extracted_urls.first
         
     | 
| 
      
 188 
     | 
    
         
            +
                      extracted_url.should_not == url
         
     | 
| 
      
 189 
     | 
    
         
            +
                      extracted_url.should == url[0...20]
         
     | 
| 
      
 190 
     | 
    
         
            +
                    end
         
     | 
| 
      
 191 
     | 
    
         
            +
                  end
         
     | 
| 
      
 192 
     | 
    
         
            +
                end
         
     | 
| 
       172 
193 
     | 
    
         
             
              end
         
     | 
| 
       173 
194 
     | 
    
         | 
| 
       174 
195 
     | 
    
         
             
              describe "urls with indices" do
         
     | 
| 
         @@ -200,6 +221,31 @@ describe Twitter::Extractor do 
     | 
|
| 
       200 
221 
     | 
    
         
             
                    @extractor.extract_urls_with_indices("http://tld-too-short.x").should == []
         
     | 
| 
       201 
222 
     | 
    
         
             
                  end
         
     | 
| 
       202 
223 
     | 
    
         
             
                end
         
     | 
| 
      
 224 
     | 
    
         
            +
             
     | 
| 
      
 225 
     | 
    
         
            +
                describe "t.co URLS" do
         
     | 
| 
      
 226 
     | 
    
         
            +
                  TestUrls::TCO.each do |url|
         
     | 
| 
      
 227 
     | 
    
         
            +
                    it "should only extract the t.co URL from the URL #{url} and adjust indices correctly" do
         
     | 
| 
      
 228 
     | 
    
         
            +
                      extracted_urls = @extractor.extract_urls_with_indices(url)
         
     | 
| 
      
 229 
     | 
    
         
            +
                      extracted_urls.size.should == 1
         
     | 
| 
      
 230 
     | 
    
         
            +
                      extracted_url = extracted_urls.first
         
     | 
| 
      
 231 
     | 
    
         
            +
                      extracted_url[:url].should_not include(url)
         
     | 
| 
      
 232 
     | 
    
         
            +
                      extracted_url[:url].should include(url[0...20])
         
     | 
| 
      
 233 
     | 
    
         
            +
                      extracted_url[:indices].first.should == 0
         
     | 
| 
      
 234 
     | 
    
         
            +
                      extracted_url[:indices].last.should == 20
         
     | 
| 
      
 235 
     | 
    
         
            +
                    end
         
     | 
| 
      
 236 
     | 
    
         
            +
             
     | 
| 
      
 237 
     | 
    
         
            +
                    it "should match the t.co URL from the URL #{url} when it's embedded in other text" do
         
     | 
| 
      
 238 
     | 
    
         
            +
                      text = "Sweet url: #{url} I found. #awesome"
         
     | 
| 
      
 239 
     | 
    
         
            +
                      extracted_urls = @extractor.extract_urls_with_indices(text)
         
     | 
| 
      
 240 
     | 
    
         
            +
                      extracted_urls.size.should == 1
         
     | 
| 
      
 241 
     | 
    
         
            +
                      extracted_url = extracted_urls.first
         
     | 
| 
      
 242 
     | 
    
         
            +
                      extracted_url[:url].should_not include(url)
         
     | 
| 
      
 243 
     | 
    
         
            +
                      extracted_url[:url].should include(url[0...20])
         
     | 
| 
      
 244 
     | 
    
         
            +
                      extracted_url[:indices].first.should == 11
         
     | 
| 
      
 245 
     | 
    
         
            +
                      extracted_url[:indices].last.should == 31
         
     | 
| 
      
 246 
     | 
    
         
            +
                    end
         
     | 
| 
      
 247 
     | 
    
         
            +
                  end
         
     | 
| 
      
 248 
     | 
    
         
            +
                end
         
     | 
| 
       203 
249 
     | 
    
         
             
              end
         
     | 
| 
       204 
250 
     | 
    
         | 
| 
       205 
251 
     | 
    
         
             
              describe "hashtags" do
         
     | 
    
        data/spec/test_urls.rb
    CHANGED
    
    | 
         @@ -52,6 +52,29 @@ module TestUrls 
     | 
|
| 
       52 
52 
     | 
    
         
             
                "http://twitt#{[0x202B].pack('U')}er.com",
         
     | 
| 
       53 
53 
     | 
    
         
             
                "http://twitt#{[0x202C].pack('U')}er.com",
         
     | 
| 
       54 
54 
     | 
    
         
             
                "http://twitt#{[0x202D].pack('U')}er.com",
         
     | 
| 
       55 
     | 
    
         
            -
                "http://twitt#{[0x202E].pack('U')}er.com" 
     | 
| 
      
 55 
     | 
    
         
            +
                "http://twitt#{[0x202E].pack('U')}er.com"
         
     | 
| 
       56 
56 
     | 
    
         
             
              ] unless defined?(TestUrls::INVALID)
         
     | 
| 
      
 57 
     | 
    
         
            +
             
     | 
| 
      
 58 
     | 
    
         
            +
              TCO = [
         
     | 
| 
      
 59 
     | 
    
         
            +
                "http://t.co/P53cv5yO!",
         
     | 
| 
      
 60 
     | 
    
         
            +
                "http://t.co/fQJmiPGg***",
         
     | 
| 
      
 61 
     | 
    
         
            +
                "http://t.co/pbY2NfTZ's",
         
     | 
| 
      
 62 
     | 
    
         
            +
                "http://t.co/2vYHpAc5;",
         
     | 
| 
      
 63 
     | 
    
         
            +
                "http://t.co/ulYGBYSo:",
         
     | 
| 
      
 64 
     | 
    
         
            +
                "http://t.co/GeT4bSiw=win",
         
     | 
| 
      
 65 
     | 
    
         
            +
                "http://t.co/8MkmHU0k+fun",
         
     | 
| 
      
 66 
     | 
    
         
            +
                "http://t.co/TKLp64dY.yes,",
         
     | 
| 
      
 67 
     | 
    
         
            +
                "http://t.co/8vuO27cI$$",
         
     | 
| 
      
 68 
     | 
    
         
            +
                "http://t.co/rPYTvdA8/",
         
     | 
| 
      
 69 
     | 
    
         
            +
                "http://t.co/WvtMw5ku%",
         
     | 
| 
      
 70 
     | 
    
         
            +
                "http://t.co/8t7G3ddS#",
         
     | 
| 
      
 71 
     | 
    
         
            +
                "http://t.co/nfHNJDV2/#!",
         
     | 
| 
      
 72 
     | 
    
         
            +
                "http://t.co/gK6NOXHs[good]",
         
     | 
| 
      
 73 
     | 
    
         
            +
                "http://t.co/dMrT0o1Y]bad",
         
     | 
| 
      
 74 
     | 
    
         
            +
                "http://t.co/FNkPfmii-",
         
     | 
| 
      
 75 
     | 
    
         
            +
                "http://t.co/sMgS3pjI_oh",
         
     | 
| 
      
 76 
     | 
    
         
            +
                "http://t.co/F8Dq3Plb~",
         
     | 
| 
      
 77 
     | 
    
         
            +
                "http://t.co/ivvH58vC&help",
         
     | 
| 
      
 78 
     | 
    
         
            +
                "http://t.co/iUBL15zD|NZ5KYLQ8"
         
     | 
| 
      
 79 
     | 
    
         
            +
              ] unless defined?(TestUrls::TCO)
         
     | 
| 
       57 
80 
     | 
    
         
             
            end
         
     | 
    
        data/twitter-text.gemspec
    CHANGED
    
    | 
         @@ -2,7 +2,7 @@ 
     | 
|
| 
       2 
2 
     | 
    
         | 
| 
       3 
3 
     | 
    
         
             
            Gem::Specification.new do |s|
         
     | 
| 
       4 
4 
     | 
    
         
             
              s.name = "twitter-text"
         
     | 
| 
       5 
     | 
    
         
            -
              s.version = "1.4.13"
     | 
| 
      
 5 
     | 
    
         
            +
              s.version = "1.4.14"
         
     | 
| 
       6 
6 
     | 
    
         
             
              s.authors = ["Matt Sanford", "Patrick Ewing", "Ben Cherry", "Britt Selvitelle",
         
     | 
| 
       7 
7 
     | 
    
         
             
                           "Raffi Krikorian", "J.P. Cummins", "Yoshimasa Niwa", "Keita Fujii"]
         
     | 
| 
       8 
8 
     | 
    
         
             
              s.email = ["matt@twitter.com", "patrick.henry.ewing@gmail.com", "bcherry@gmail.com", "bs@brittspace.com",
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,13 +1,13 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification 
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: twitter-text
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version 
         
     | 
| 
       4 
     | 
    
         
            -
              hash:  
     | 
| 
      
 4 
     | 
    
         
            +
              hash: 27
         
     | 
| 
       5 
5 
     | 
    
         
             
              prerelease: 
         
     | 
| 
       6 
6 
     | 
    
         
             
              segments: 
         
     | 
| 
       7 
7 
     | 
    
         
             
              - 1
         
     | 
| 
       8 
8 
     | 
    
         
             
              - 4
         
     | 
| 
       9 
     | 
    
         
            -
              - 13
     | 
| 
       10 
     | 
    
         
            -
              version: 1.4.13
     | 
| 
      
 9 
     | 
    
         
            +
              - 14
         
     | 
| 
      
 10 
     | 
    
         
            +
              version: 1.4.14
         
     | 
| 
       11 
11 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       12 
12 
     | 
    
         
             
            authors: 
         
     | 
| 
       13 
13 
     | 
    
         
             
            - Matt Sanford
         
     | 
| 
         @@ -22,7 +22,7 @@ autorequire: 
     | 
|
| 
       22 
22 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       23 
23 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       24 
24 
     | 
    
         | 
| 
       25 
     | 
    
         
            -
            date: 2011- 
     | 
| 
      
 25 
     | 
    
         
            +
            date: 2011-12-01 00:00:00 -08:00
         
     | 
| 
       26 
26 
     | 
    
         
             
            default_executable: 
         
     | 
| 
       27 
27 
     | 
    
         
             
            dependencies: 
         
     | 
| 
       28 
28 
     | 
    
         
             
            - !ruby/object:Gem::Dependency 
         
     |