RubyGems - twitter-text-editted - Versions diffs - 1.0.0 - Mend

twitter-text-editted 1.0.0

Files changed (35) hide show

checksums.yaml +15 -0
data/.gemtest +0 -0
data/.gitignore +40 -0
data/.gitmodules +3 -0
data/.rspec +2 -0
data/.travis.yml +4 -0
data/Gemfile +4 -0
data/LICENSE +188 -0
data/README.rdoc +112 -0
data/Rakefile +64 -0
data/lib/twitter-text.rb +22 -0
data/lib/twitter-text/autolink.rb +443 -0
data/lib/twitter-text/deprecation.rb +15 -0
data/lib/twitter-text/extractor.rb +328 -0
data/lib/twitter-text/hash_helper.rb +21 -0
data/lib/twitter-text/hit_highlighter.rb +86 -0
data/lib/twitter-text/regex.rb +366 -0
data/lib/twitter-text/rewriter.rb +59 -0
data/lib/twitter-text/unicode.rb +26 -0
data/lib/twitter-text/validation.rb +113 -0
data/script/destroy +14 -0
data/script/generate +14 -0
data/spec/autolinking_spec.rb +826 -0
data/spec/extractor_spec.rb +368 -0
data/spec/hithighlighter_spec.rb +92 -0
data/spec/regex_spec.rb +38 -0
data/spec/rewriter_spec.rb +548 -0
data/spec/spec_helper.rb +127 -0
data/spec/test_urls.rb +80 -0
data/spec/twitter_text_spec.rb +21 -0
data/spec/unicode_spec.rb +31 -0
data/spec/validation_spec.rb +43 -0
data/test/conformance_test.rb +182 -0
data/twitter-text.gemspec +29 -0
metadata +193 -0

data/spec/extractor_spec.rb ADDED

@@ -0,0 +1,368 @@
+# encoding: utf-8
+require File.dirname(__FILE__) + '/spec_helper'
+class TestExtractor # Bare host class: mixes in Twitter::Extractor so the specs can call its methods on an instance.
+  include Twitter::Extractor
+end
+describe Twitter::Extractor do
+  before do # RSpec hook: builds the TestExtractor instance that the examples below reach through @extractor.
+    @extractor = TestExtractor.new
+  end
+  describe "mentions" do # Specs for extract_mentioned_screen_names: expected results are arrays of names without the leading @.
+    context "single screen name alone " do
+      it "should be linked" do
+        @extractor.extract_mentioned_screen_names("@alice").should == ["alice"]
+      end
+      it "should be linked with _" do
+        @extractor.extract_mentioned_screen_names("@alice_adams").should == ["alice_adams"]
+      end
+      it "should be linked if numeric" do
+        @extractor.extract_mentioned_screen_names("@1234").should == ["1234"]
+      end
+    end
+    context "multiple screen names" do
+      it "should both be linked" do
+        @extractor.extract_mentioned_screen_names("@alice @bob").should == ["alice", "bob"]
+      end
+    end
+    context "screen names embedded in text" do
+      it "should be linked in Latin text" do
+        @extractor.extract_mentioned_screen_names("waiting for @alice to arrive").should == ["alice"]
+      end
+      it "should be linked in Japanese text" do
+        @extractor.extract_mentioned_screen_names("の@aliceに到着を待っている").should == ["alice"]
+      end
+      it "should ignore mentions preceded by !, @, #, $, %, & or *" do
+        invalid_chars = ['!', '@', '#', '$', '%', '&', '*']
+        invalid_chars.each do |c|
+          @extractor.extract_mentioned_screen_names("f#{c}@kn").should == [] # each char sits directly before the @ and must suppress the match
+        end
+      end
+    end
+    it "should accept a block arugment and call it in order" do # NOTE(review): arugment is a typo for argument in this description string; left untouched here
+      needed = ["alice", "bob"]
+      @extractor.extract_mentioned_screen_names("@alice @bob") do |sn|
+        sn.should == needed.shift # consume expectations front to back so call order is checked
+      end
+      needed.should == [] # empty only if the block ran once per expected name
+    end
+  end
+  describe "mentions with indices" do # Specs for extract_mentioned_screen_names_with_indices: each hit is a hash with :screen_name and :indices.
+    context "single screen name alone " do
+      it "should be linked and the correct indices" do
+        @extractor.extract_mentioned_screen_names_with_indices("@alice").should == [{:screen_name => "alice", :indices => [0, 6]}] # range starts at the @ and ends one past the last character
+      end
+      it "should be linked with _ and the correct indices" do
+        @extractor.extract_mentioned_screen_names_with_indices("@alice_adams").should == [{:screen_name => "alice_adams", :indices => [0, 12]}]
+      end
+      it "should be linked if numeric and the correct indices" do
+        @extractor.extract_mentioned_screen_names_with_indices("@1234").should == [{:screen_name => "1234", :indices => [0, 5]}]
+      end
+    end
+    context "multiple screen names" do
+      it "should both be linked with the correct indices" do
+        @extractor.extract_mentioned_screen_names_with_indices("@alice @bob").should ==
+          [{:screen_name => "alice", :indices => [0, 6]},
+           {:screen_name => "bob", :indices => [7, 11]}]
+      end
+      it "should be linked with the correct indices even when repeated" do # a repeated name must get its own later range, not the first occurrence again
+        @extractor.extract_mentioned_screen_names_with_indices("@alice @alice @bob").should ==
+          [{:screen_name => "alice", :indices => [0, 6]},
+           {:screen_name => "alice", :indices => [7, 13]},
+           {:screen_name => "bob", :indices => [14, 18]}]
+      end
+    end
+    context "screen names embedded in text" do
+      it "should be linked in Latin text with the correct indices" do
+        @extractor.extract_mentioned_screen_names_with_indices("waiting for @alice to arrive").should == [{:screen_name => "alice", :indices => [12, 18]}]
+      end
+      it "should be linked in Japanese text with the correct indices" do
+        @extractor.extract_mentioned_screen_names_with_indices("の@aliceに到着を待っている").should == [{:screen_name => "alice", :indices => [1, 7]}] # the leading Japanese character counts as one position
+      end
+    end
+    it "should accept a block arugment and call it in order" do # NOTE(review): arugment is a typo for argument in this description string; left untouched here
+      needed = [{:screen_name => "alice", :indices => [0, 6]}, {:screen_name => "bob", :indices => [7, 11]}]
+      @extractor.extract_mentioned_screen_names_with_indices("@alice @bob") do |sn, start_index, end_index| # block form receives name, start and end as separate arguments
+        data = needed.shift
+        sn.should == data[:screen_name]
+        start_index.should == data[:indices].first
+        end_index.should == data[:indices].last
+      end
+      needed.should == [] # empty only if the block ran once per expected mention
+    end
+    it "should extract screen name in text with supplementary character" do
+      @extractor.extract_mentioned_screen_names_with_indices("#{[0x10400].pack('U')} @alice").should == [{:screen_name => "alice", :indices => [2, 8]}] # U+10400 plus a space precede the @, so the supplementary character is counted as a single position
+    end
+  end
+  describe "replies" do # Specs for extract_reply_screen_name: expects the name as a String, or nil when the text is not a reply.
+    context "should be extracted from" do
+      it "should extract from lone name" do
+        @extractor.extract_reply_screen_name("@alice").should == "alice"
+      end
+      it "should extract from the start" do
+        @extractor.extract_reply_screen_name("@alice reply text").should == "alice"
+      end
+      it "should extract preceded by a space" do
+        @extractor.extract_reply_screen_name(" @alice reply text").should == "alice"
+      end
+      it "should extract preceded by a full-width space" do
+        @extractor.extract_reply_screen_name("#{[0x3000].pack('U')}@alice reply text").should == "alice" # U+3000 is built from its code point rather than typed literally
+      end
+    end
+    context "should not be extracted from" do
+      it "should not be extracted when preceded by text" do
+        @extractor.extract_reply_screen_name("reply @alice text").should == nil
+      end
+      it "should not be extracted when preceded by puctuation" do # NOTE(review): puctuation is a typo for punctuation in this description string; left untouched here
+        %w(. / _ - + # ! @).each do |punct|
+          @extractor.extract_reply_screen_name("#{punct}@alice text").should == nil
+        end
+      end
+    end
+    context "should accept a block arugment" do # NOTE(review): arugment is a typo for argument in this description string; left untouched here
+      it "should call the block on match" do
+        @extractor.extract_reply_screen_name("@alice") do |sn|
+          sn.should == "alice" # NOTE(review): nothing here fails if the block is never invoked
+        end
+      end
+      it "should not call the block on no match" do
+        calls = 0
+        @extractor.extract_reply_screen_name("not a reply") do |sn|
+          calls += 1
+        end
+        calls.should == 0 # counter stays at zero only if the block was never yielded to
+      end
+    end
+  end
+  describe "urls" do # Specs for extract_urls, generated per fixture URL; TestUrls is defined outside this file (presumably spec/test_urls.rb - confirm).
+    describe "matching URLS" do
+      TestUrls::VALID.each do |url| # two examples are generated for every fixture URL
+        it "should extract the URL #{url} and prefix it with a protocol if missing" do
+          @extractor.extract_urls(url).first.should include(url) # include rather than ==, so the extracted string may be longer than the fixture
+        end
+        it "should match the URL #{url} when it's embedded in other text" do
+          text = "Sweet url: #{url} I found. #awesome"
+          @extractor.extract_urls(text).first.should include(url)
+        end
+      end
+    end
+    describe "invalid URLS" do
+      it "does not link urls with invalid domains" do
+        @extractor.extract_urls("http://tld-too-short.x").should == []
+      end
+    end
+    describe "t.co URLS" do
+      TestUrls::TCO.each do |url|
+        it "should only extract the t.co URL from the URL #{url}" do
+          extracted_urls = @extractor.extract_urls(url)
+          extracted_urls.size.should == 1
+          extracted_url = extracted_urls.first
+          extracted_url.should_not == url
+          extracted_url.should == url[0...20] # expected result is exactly the first 20 characters of the fixture
+        end
+        it "should match the t.co URL from the URL #{url} when it's embedded in other text" do
+          text = "Sweet url: #{url} I found. #awesome"
+          extracted_urls = @extractor.extract_urls(text)
+          extracted_urls.size.should == 1
+          extracted_url = extracted_urls.first
+          extracted_url.should_not == url
+          extracted_url.should == url[0...20]
+        end
+      end
+    end
+  end
+  describe "urls with indices" do # Specs for extract_urls_with_indices: each hit is a hash with :url and :indices.
+    describe "matching URLS" do
+      TestUrls::VALID.each do |url|
+        it "should extract the URL #{url} and prefix it with a protocol if missing" do
+          extracted_urls = @extractor.extract_urls_with_indices(url)
+          extracted_urls.size.should == 1
+          extracted_url = extracted_urls.first
+          extracted_url[:url].should include(url)
+          extracted_url[:indices].first.should == 0
+          extracted_url[:indices].last.should == url.chars.to_a.size # end index equals the character count of the fixture URL
+        end
+        it "should match the URL #{url} when it's embedded in other text" do
+          text = "Sweet url: #{url} I found. #awesome"
+          extracted_urls = @extractor.extract_urls_with_indices(text)
+          extracted_urls.size.should == 1
+          extracted_url = extracted_urls.first
+          extracted_url[:url].should include(url)
+          extracted_url[:indices].first.should == 11 # 11 is the length of the prefix before the URL in text
+          extracted_url[:indices].last.should == 11 + url.chars.to_a.size
+        end
+      end
+      it "should extract URL in text with supplementary character" do
+        @extractor.extract_urls_with_indices("#{[0x10400].pack('U')} http://twitter.com").should == [{:url => "http://twitter.com", :indices => [2, 20]}] # U+10400 plus a space precede the URL, so the supplementary character is counted as a single position
+      end
+    end
+    describe "invalid URLS" do
+      it "does not link urls with invalid domains" do
+        @extractor.extract_urls_with_indices("http://tld-too-short.x").should == []
+      end
+    end
+    describe "t.co URLS" do
+      TestUrls::TCO.each do |url|
+        it "should only extract the t.co URL from the URL #{url} and adjust indices correctly" do
+          extracted_urls = @extractor.extract_urls_with_indices(url)
+          extracted_urls.size.should == 1
+          extracted_url = extracted_urls.first
+          extracted_url[:url].should_not include(url)
+          extracted_url[:url].should include(url[0...20]) # only the first 20 characters of the fixture are expected back
+          extracted_url[:indices].first.should == 0
+          extracted_url[:indices].last.should == 20
+        end
+        it "should match the t.co URL from the URL #{url} when it's embedded in other text" do
+          text = "Sweet url: #{url} I found. #awesome"
+          extracted_urls = @extractor.extract_urls_with_indices(text)
+          extracted_urls.size.should == 1
+          extracted_url = extracted_urls.first
+          extracted_url[:url].should_not include(url)
+          extracted_url[:url].should include(url[0...20])
+          extracted_url[:indices].first.should == 11
+          extracted_url[:indices].last.should == 31 # 11-character prefix plus the 20-character expected URL
+        end
+      end
+    end
+  end
+  describe "hashtags" do # Specs for extract_hashtags: expected results are arrays of tag text without the leading hash sign.
+    context "extracts latin/numeric hashtags" do
+      %w(text text123 123text).each do |hashtag| # two examples are generated per sample tag
+        it "should extract ##{hashtag}" do
+          @extractor.extract_hashtags("##{hashtag}").should == [hashtag]
+        end
+        it "should extract ##{hashtag} within text" do
+          @extractor.extract_hashtags("pre-text ##{hashtag} post-text").should == [hashtag]
+        end
+      end
+    end
+    context "international hashtags" do
+      context "should allow accents" do
+        %w(mañana café münchen).each do |hashtag|
+          it "should extract ##{hashtag}" do
+            @extractor.extract_hashtags("##{hashtag}").should == [hashtag]
+          end
+          it "should extract ##{hashtag} within text" do
+            @extractor.extract_hashtags("pre-text ##{hashtag} post-text").should == [hashtag]
+          end
+        end
+        it "should not allow the multiplication character" do
+          @extractor.extract_hashtags("#pre#{Twitter::Unicode::U00D7}post").should == ["pre"] # the tag is expected to stop at the interpolated character
+        end
+        it "should not allow the division character" do
+          @extractor.extract_hashtags("#pre#{Twitter::Unicode::U00F7}post").should == ["pre"]
+        end
+      end
+    end
+    it "should not extract numeric hashtags" do # digits-only text after the hash sign is expected to yield nothing
+      @extractor.extract_hashtags("#1234").should == []
+    end
+    it "should extract hashtag followed by punctuations" do
+      @extractor.extract_hashtags("#test1: #test2; #test3\"").should == ["test1", "test2" ,"test3"] # trailing colon, semicolon and double quote must not become part of the tag
+    end
+  end
+  describe "hashtags with indices" do # Specs for extract_hashtags_with_indices, driven by the two helper methods defined first.
+    def match_hashtag_in_text(hashtag, text, offset = 0) # Helper: asserts text yields exactly one hashtag with the given tag text, starting at offset.
+      extracted_hashtags = @extractor.extract_hashtags_with_indices(text)
+      extracted_hashtags.size.should == 1
+      extracted_hashtag = extracted_hashtags.first
+      extracted_hashtag[:hashtag].should == hashtag
+      extracted_hashtag[:indices].first.should == offset
+      extracted_hashtag[:indices].last.should == offset + hashtag.chars.to_a.size + 1 # character count of the tag plus one for the leading hash sign
+    end
+    def not_match_hashtag_in_text(text) # Helper: asserts that no hashtag at all is extracted from text.
+      extracted_hashtags = @extractor.extract_hashtags_with_indices(text)
+      extracted_hashtags.size.should == 0
+    end
+    context "extracts latin/numeric hashtags" do
+      %w(text text123 123text).each do |hashtag|
+        it "should extract ##{hashtag}" do
+          match_hashtag_in_text(hashtag, "##{hashtag}")
+        end
+        it "should extract ##{hashtag} within text" do
+          match_hashtag_in_text(hashtag, "pre-text ##{hashtag} post-text", 9) # 9 is the length of the prefix before the hash sign
+        end
+      end
+    end
+    context "international hashtags" do
+      context "should allow accents" do
+        %w(mañana café münchen).each do |hashtag|
+          it "should extract ##{hashtag}" do
+            match_hashtag_in_text(hashtag, "##{hashtag}")
+          end
+          it "should extract ##{hashtag} within text" do
+            match_hashtag_in_text(hashtag, "pre-text ##{hashtag} post-text", 9)
+          end
+        end
+        it "should not allow the multiplication character" do
+          match_hashtag_in_text("pre", "#pre#{[0xd7].pack('U')}post", 0) # U+00D7 built from its code point; only pre is expected as the tag
+        end
+        it "should not allow the division character" do
+          match_hashtag_in_text("pre", "#pre#{[0xf7].pack('U')}post", 0) # U+00F7 built from its code point; only pre is expected as the tag
+        end
+      end
+    end
+    it "should not extract numeric hashtags" do
+      not_match_hashtag_in_text("#1234")
+    end
+    it "should extract hashtag in text with supplementary character" do
+      match_hashtag_in_text("hashtag", "#{[0x10400].pack('U')} #hashtag", 2) # U+10400 plus a space precede the tag, so the supplementary character is counted as a single position
+    end
+  end
+end

data/spec/hithighlighter_spec.rb ADDED

@@ -0,0 +1,92 @@
+# encoding: utf-8
+require File.dirname(__FILE__) + '/spec_helper'
+class TestHitHighlighter # Bare host class: mixes in Twitter::HitHighlighter so the specs can call hit_highlight on an instance.
+  include Twitter::HitHighlighter
+end
+describe Twitter::HitHighlighter do # Specs for hit_highlight(text, hits, options): hits are pairs of start and end positions to wrap in a tag.
+  describe "highlight" do
+    before do
+      @highlighter = TestHitHighlighter.new
+    end
+    context "with options" do
+      before do
+        @original = "Testing this hit highliter"
+        @hits = [[13,16]] # covers the word hit: start is inclusive, end is one past the last character
+      end
+      it "should default to <em> tags" do
+        @highlighter.hit_highlight(@original, @hits).should == "Testing this <em>hit</em> highliter"
+      end
+      it "should allow tag override" do
+        @highlighter.hit_highlight(@original, @hits, :tag => 'b').should == "Testing this <b>hit</b> highliter"
+      end
+    end
+    context "without links" do
+      before do
+        @original = "Hey! this is a test tweet"
+      end
+      it "should return original when no hits are provided" do
+        @highlighter.hit_highlight(@original).should == @original
+      end
+      it "should highlight one hit" do
+        @highlighter.hit_highlight(@original, hits = [[5, 9]]).should == "Hey! <em>this</em> is a test tweet" # NOTE(review): hits = assigns a local that is never read again; the array is simply passed positionally
+      end
+      it "should highlight two hits" do
+        @highlighter.hit_highlight(@original, hits = [[5, 9], [15, 19]]).should == "Hey! <em>this</em> is a <em>test</em> tweet"
+      end
+      it "should correctly highlight first-word hits" do
+        @highlighter.hit_highlight(@original, hits = [[0, 3]]).should == "<em>Hey</em>! this is a test tweet"
+      end
+      it "should correctly highlight last-word hits" do
+        @highlighter.hit_highlight(@original, hits = [[20, 25]]).should == "Hey! this is a test <em>tweet</em>"
+      end
+    end
+    context "with links" do # hit positions in these examples count visible text only; existing tags are skipped
+      it "should highlight with a single link" do
+        @highlighter.hit_highlight("@<a>bcherry</a> this was a test tweet", [[9, 13]]).should == "@<a>bcherry</a> <em>this</em> was a test tweet"
+      end
+      it "should highlight with link at the end" do
+        @highlighter.hit_highlight("test test <a>test</a>", [[5, 9]]).should == "test <em>test</em> <a>test</a>"
+      end
+      it "should highlight with a link at the beginning" do
+        @highlighter.hit_highlight("<a>test</a> test test", [[5, 9]]).should == "<a>test</a> <em>test</em> test"
+      end
+      it "should highlight an entire link" do
+        @highlighter.hit_highlight("test <a>test</a> test", [[5, 9]]).should == "test <a><em>test</em></a> test" # the em tag is expected inside the existing link tag
+      end
+      it "should highlight within a link" do
+        @highlighter.hit_highlight("test <a>test</a> test", [[6, 8]]).should == "test <a>t<em>es</em>t</a> test"
+      end
+      it "should highlight around a link" do
+        @highlighter.hit_highlight("test <a>test</a> test", [[3, 11]]).should == "tes<em>t <a>test</a> t</em>est" # a hit spanning the link wraps the whole link element
+      end
+      it "should fail gracefully with bad hits" do
+        @highlighter.hit_highlight("test test", [[5, 20]]).should == "test <em>test</em>" # end position 20 is past the 9-character text; highlight is expected to stop at the end of the text
+      end
+      it "should not mess up with touching tags" do
+        @highlighter.hit_highlight("<a>foo</a><a>foo</a>", [[3,6]]).should == "<a>foo</a><a><em>foo</em></a>"
+      end
+    end
+  end
+end