RubyGems - twitter-text-simpleidn - Versions diffs - 3.0.0.0 - Mend

twitter-text-simpleidn 3.0.0.0

Files changed (44) hide show

checksums.yaml +7 -0
data/.gemtest +0 -0
data/.gitignore +40 -0
data/.gitmodules +3 -0
data/.rspec +2 -0
data/CHANGELOG.md +35 -0
data/Gemfile +4 -0
data/LICENSE +188 -0
data/README.md +193 -0
data/Rakefile +52 -0
data/config/README.md +142 -0
data/config/v1.json +8 -0
data/config/v2.json +29 -0
data/config/v3.json +30 -0
data/lib/assets/tld_lib.yml +1571 -0
data/lib/twitter-text.rb +29 -0
data/lib/twitter-text/autolink.rb +453 -0
data/lib/twitter-text/configuration.rb +68 -0
data/lib/twitter-text/deprecation.rb +21 -0
data/lib/twitter-text/emoji_regex.rb +27 -0
data/lib/twitter-text/extractor.rb +388 -0
data/lib/twitter-text/hash_helper.rb +27 -0
data/lib/twitter-text/hit_highlighter.rb +92 -0
data/lib/twitter-text/regex.rb +381 -0
data/lib/twitter-text/rewriter.rb +69 -0
data/lib/twitter-text/unicode.rb +31 -0
data/lib/twitter-text/validation.rb +251 -0
data/lib/twitter-text/weighted_range.rb +24 -0
data/script/destroy +14 -0
data/script/generate +14 -0
data/spec/autolinking_spec.rb +848 -0
data/spec/configuration_spec.rb +136 -0
data/spec/extractor_spec.rb +392 -0
data/spec/hithighlighter_spec.rb +96 -0
data/spec/regex_spec.rb +76 -0
data/spec/rewriter_spec.rb +553 -0
data/spec/spec_helper.rb +139 -0
data/spec/test_urls.rb +90 -0
data/spec/twitter_text_spec.rb +25 -0
data/spec/unicode_spec.rb +35 -0
data/spec/validation_spec.rb +87 -0
data/test/conformance_test.rb +242 -0
data/twitter-text.gemspec +35 -0
metadata +229 -0

@@ -0,0 +1,136 @@
+# Copyright 2018 Twitter, Inc.
+# Licensed under the Apache License, Version 2.0
+# http://www.apache.org/licenses/LICENSE-2.0
+# encoding: utf-8
+require File.dirname(__FILE__) + '/spec_helper'
+describe Twitter::TwitterText::Configuration do
+  context "configuration" do
+    context "with invalid data" do
+      it "should raise an exception" do
+        invalid_hash = Twitter::TwitterText::Configuration.parse_string("{\"version\":2,\"maxWeightedTweetLength\":280,\"scale\":100,\"defaultWeight\":200,\"transformedURLLength\":23,\"ranges\":[{\"start\":0,\"end\":true,\"weight\":false},{\"start\":8192,\"end\":8205,\"weight\":100},{\"start\":8208,\"end\":8223,\"weight\":100},{\"start\":8242,\"end\":8247,\"weight\":100}]}")
+        expect { Twitter::TwitterText::Configuration.new(invalid_hash) }.to raise_error(ArgumentError)
+      end
+    end
+    context "with defaults" do
+      before do
+        Twitter::TwitterText::Configuration.default_configuration = Twitter::TwitterText::Configuration.configuration_from_file(Twitter::TwitterText::Configuration::CONFIG_V2)
+      end
+      it "should define version constants" do
+        expect(Twitter::TwitterText::Configuration.const_defined?(:CONFIG_V1)).to be true
+        expect(Twitter::TwitterText::Configuration.const_defined?(:CONFIG_V2)).to be true
+        expect(Twitter::TwitterText::Configuration.const_defined?(:CONFIG_V3)).to be true
+      end
+      it "should define a default configuration" do
+        expect(Twitter::TwitterText::Configuration.default_configuration).to_not be_nil
+        expect(Twitter::TwitterText::Configuration.default_configuration.version).to eq(2)
+      end
+    end
+    context "with v1 configuration" do
+      before do
+        @config = Twitter::TwitterText::Configuration.configuration_from_file(Twitter::TwitterText::Configuration::CONFIG_V1)
+      end
+      it "should have a version" do
+        expect(@config.version).to eq(1)
+      end
+      it "should have a max_weighted_tweet_length" do
+        expect(@config.max_weighted_tweet_length).to eq(140)
+      end
+      it "should have a scale" do
+        expect(@config.scale).to eq(1)
+      end
+      it "should have a default_weight" do
+        expect(@config.default_weight).to eq(1)
+      end
+      it "should have a transformed_url_length" do
+        expect(@config.transformed_url_length).to eq(23)
+      end
+    end
+    context "with v2 configuration" do
+      before do
+        @config = Twitter::TwitterText::Configuration.configuration_from_file(Twitter::TwitterText::Configuration::CONFIG_V2)
+      end
+      it "should have a version" do
+        expect(@config.version).to eq(2)
+      end
+      it "should have a max_weighted_tweet_length" do
+        expect(@config.max_weighted_tweet_length).to eq(280)
+      end
+      it "should have a scale" do
+        expect(@config.scale).to eq(100)
+      end
+      it "should have a default_weight" do
+        expect(@config.default_weight).to eq(200)
+      end
+      it "should have a transformed_url_length" do
+        expect(@config.transformed_url_length).to eq(23)
+      end
+      it "should have a configured range" do
+        expect(@config.ranges).to be_kind_of(Array)
+        expect(@config.ranges.count).to be > 0
+        expect(@config.ranges[0]).to be_kind_of(Twitter::TwitterText::WeightedRange)
+        weighted_range = @config.ranges[0]
+        expect(weighted_range.start).to be_kind_of(Integer)
+        expect(weighted_range.end).to be_kind_of(Integer)
+        expect(weighted_range.weight).to be_kind_of(Integer)
+      end
+    end
+    context "with v3 configuration" do
+      before do
+        @config = Twitter::TwitterText::Configuration.configuration_from_file(Twitter::TwitterText::Configuration::CONFIG_V3)
+      end
+      it "should have a version" do
+        expect(@config.version).to eq(3)
+      end
+      it "should have a max_weighted_tweet_length" do
+        expect(@config.max_weighted_tweet_length).to eq(280)
+      end
+      it "should have a scale" do
+        expect(@config.scale).to eq(100)
+      end
+      it "should have a default_weight" do
+        expect(@config.default_weight).to eq(200)
+      end
+      it "should have a transformed_url_length" do
+        expect(@config.transformed_url_length).to eq(23)
+      end
+      it "should have a configured range" do
+        expect(@config.ranges).to be_kind_of(Array)
+        expect(@config.ranges.count).to be > 0
+        expect(@config.ranges[0]).to be_kind_of(Twitter::TwitterText::WeightedRange)
+        weighted_range = @config.ranges[0]
+        expect(weighted_range.start).to be_kind_of(Integer)
+        expect(weighted_range.end).to be_kind_of(Integer)
+        expect(weighted_range.weight).to be_kind_of(Integer)
+      end
+      it "should support discounting emoji" do
+        expect(@config.emoji_parsing_enabled).to be true
+      end
+    end
+  end
+end

data/spec/extractor_spec.rb ADDED

@@ -0,0 +1,392 @@
+# Copyright 2018 Twitter, Inc.
+# Licensed under the Apache License, Version 2.0
+# http://www.apache.org/licenses/LICENSE-2.0
+# encoding: utf-8
+require File.dirname(__FILE__) + '/spec_helper'
+class TestExtractor # Bare host class: lets the specs call the Extractor mixin's methods on an instance.
+  include Twitter::TwitterText::Extractor
+end
+describe Twitter::TwitterText::Extractor do
+  # A new TestExtractor instance is assigned before each example.
+  before { @extractor = TestExtractor.new }
+  describe "mentions" do
+    context "single screen name alone " do
+      it "should be linked" do
+        expect(@extractor.extract_mentioned_screen_names("@alice")).to be == ["alice"]
+      end
+      it "should be linked with _" do
+        expect(@extractor.extract_mentioned_screen_names("@alice_adams")).to be == ["alice_adams"]
+      end
+      it "should be linked if numeric" do
+        expect(@extractor.extract_mentioned_screen_names("@1234")).to be == ["1234"]
+      end
+    end
+    context "multiple screen names" do
+      it "should both be linked" do
+        expect(@extractor.extract_mentioned_screen_names("@alice @bob")).to be == ["alice", "bob"]
+      end
+    end
+    context "screen names embedded in text" do
+      it "should be linked in Latin text" do
+        expect(@extractor.extract_mentioned_screen_names("waiting for @alice to arrive")).to be == ["alice"]
+      end
+      it "should be linked in Japanese text" do
+        expect(@extractor.extract_mentioned_screen_names("の@aliceに到着を待っている")).to be == ["alice"]
+      end
+      it "should ignore mentions preceded by !, @, #, $, %, & or *" do
+        invalid_chars = ['!', '@', '#', '$', '%', '&', '*']
+        invalid_chars.each do |c|
+          expect(@extractor.extract_mentioned_screen_names("f#{c}@kn")).to be == []
+        end
+      end
+    end
+    it "should accept a block arugment and call it in order" do
+      needed = ["alice", "bob"]
+      @extractor.extract_mentioned_screen_names("@alice @bob") do |sn|
+        expect(sn).to be == needed.shift
+      end
+      expect(needed).to be == []
+    end
+  end
+  describe "mentions with indices" do
+    context "single screen name alone " do
+      it "should be linked and the correct indices" do
+        expect(@extractor.extract_mentioned_screen_names_with_indices("@alice")).to be == [{:screen_name => "alice", :indices => [0, 6]}]
+      end
+      it "should be linked with _ and the correct indices" do
+        expect(@extractor.extract_mentioned_screen_names_with_indices("@alice_adams")).to be == [{:screen_name => "alice_adams", :indices => [0, 12]}]
+      end
+      it "should be linked if numeric and the correct indices" do
+        expect(@extractor.extract_mentioned_screen_names_with_indices("@1234")).to be == [{:screen_name => "1234", :indices => [0, 5]}]
+      end
+    end
+    context "multiple screen names" do
+      it "should both be linked with the correct indices" do
+        expect(@extractor.extract_mentioned_screen_names_with_indices("@alice @bob")).to be ==
+          [{:screen_name => "alice", :indices => [0, 6]},
+           {:screen_name => "bob", :indices => [7, 11]}]
+      end
+      it "should be linked with the correct indices even when repeated" do
+        expect(@extractor.extract_mentioned_screen_names_with_indices("@alice @alice @bob")).to be ==
+          [{:screen_name => "alice", :indices => [0, 6]},
+           {:screen_name => "alice", :indices => [7, 13]},
+           {:screen_name => "bob", :indices => [14, 18]}]
+      end
+    end
+    context "screen names embedded in text" do
+      it "should be linked in Latin text with the correct indices" do
+        expect(@extractor.extract_mentioned_screen_names_with_indices("waiting for @alice to arrive")).to be == [{:screen_name => "alice", :indices => [12, 18]}]
+      end
+      it "should be linked in Japanese text with the correct indices" do
+        expect(@extractor.extract_mentioned_screen_names_with_indices("の@aliceに到着を待っている")).to be == [{:screen_name => "alice", :indices => [1, 7]}]
+      end
+    end
+    it "should accept a block arugment and call it in order" do
+      needed = [{:screen_name => "alice", :indices => [0, 6]}, {:screen_name => "bob", :indices => [7, 11]}]
+      @extractor.extract_mentioned_screen_names_with_indices("@alice @bob") do |sn, start_index, end_index|
+        data = needed.shift
+        expect(sn).to be == data[:screen_name]
+        expect(start_index).to be == data[:indices].first
+        expect(end_index).to be == data[:indices].last
+      end
+      expect(needed).to be == []
+    end
+    it "should extract screen name in text with supplementary character" do
+      expect(@extractor.extract_mentioned_screen_names_with_indices("#{[0x10400].pack('U')} @alice")).to be == [{:screen_name => "alice", :indices => [2, 8]}]
+    end
+  end
+  describe "replies" do
+    context "should be extracted from" do
+      it "should extract from lone name" do
+        expect(@extractor.extract_reply_screen_name("@alice")).to be == "alice"
+      end
+      it "should extract from the start" do
+        expect(@extractor.extract_reply_screen_name("@alice reply text")).to be == "alice"
+      end
+      it "should extract preceded by a space" do
+        expect(@extractor.extract_reply_screen_name(" @alice reply text")).to be == "alice"
+      end
+      it "should extract preceded by a full-width space" do
+        expect(@extractor.extract_reply_screen_name("#{[0x3000].pack('U')}@alice reply text")).to be == "alice"
+      end
+    end
+    context "should not be extracted from" do
+      it "should not be extracted when preceded by text" do
+        expect(@extractor.extract_reply_screen_name("reply @alice text")).to be == nil
+      end
+      it "should not be extracted when preceded by puctuation" do
+        %w(. / _ - + # ! @).each do |punct|
+          expect(@extractor.extract_reply_screen_name("#{punct}@alice text")).to be == nil
+        end
+      end
+    end
+    context "should accept a block arugment" do
+      it "should call the block on match" do
+        @extractor.extract_reply_screen_name("@alice") do |sn|
+          expect(sn).to be == "alice"
+        end
+      end
+      it "should not call the block on no match" do
+        calls = 0
+        @extractor.extract_reply_screen_name("not a reply") do |sn|
+          calls += 1
+        end
+        expect(calls).to be == 0
+      end
+    end
+  end
+  describe "urls" do
+    describe "matching URLS" do
+      TestUrls::VALID.each do |url|
+        it "should extract the URL #{url} and prefix it with a protocol if missing" do
+          expect(@extractor.extract_urls(url).first).to include(url)
+        end
+        it "should match the URL #{url} when it's embedded in other text" do
+          text = "Sweet url: #{url} I found. #awesome"
+          expect(@extractor.extract_urls(text).first).to include(url)
+        end
+      end
+    end
+    describe "invalid URLS" do
+      TestUrls::INVALID.each do |url|
+        it "does not extract URL from #{url}" do
+          expect(@extractor.extract_urls(url).first).to be nil
+        end
+      end
+    end
+    describe "t.co URLS" do
+      TestUrls::TCO.each do |url|
+        it "should only extract the t.co URL from the URL #{url}" do
+          extracted_urls = @extractor.extract_urls(url)
+          expect(extracted_urls.size).to be == 1
+          extracted_url = extracted_urls.first
+          expect(extracted_url).to_not be == url
+          expect(extracted_url).to be == url[0...20]
+        end
+        it "should match the t.co URL from the URL #{url} when it's embedded in other text" do
+          text = "Sweet url: #{url} I found. #awesome"
+          extracted_urls = @extractor.extract_urls(text)
+          expect(extracted_urls.size).to be == 1
+          extracted_url = extracted_urls.first
+          expect(extracted_url).to_not be == url
+          expect(extracted_url).to be == url[0...20]
+        end
+      end
+    end
+  end
+  describe "urls with indices" do
+    describe "matching URLS" do
+      TestUrls::VALID.each do |url|
+        it "should extract the URL #{url} and prefix it with a protocol if missing" do
+          extracted_urls = @extractor.extract_urls_with_indices(url)
+          expect(extracted_urls.size).to be == 1
+          extracted_url = extracted_urls.first
+          expect(extracted_url[:url]).to include(url)
+          expect(extracted_url[:indices].first).to be == 0
+          expect(extracted_url[:indices].last).to be == url.chars.to_a.size
+        end
+        it "should match the URL #{url} when it's embedded in other text" do
+          text = "Sweet url: #{url} I found. #awesome"
+          extracted_urls = @extractor.extract_urls_with_indices(text)
+          expect(extracted_urls.size).to be == 1
+          extracted_url = extracted_urls.first
+          expect(extracted_url[:url]).to include(url)
+          expect(extracted_url[:indices].first).to be == 11
+          expect(extracted_url[:indices].last).to be == 11 + url.chars.to_a.size
+        end
+      end
+      it "should extract URL in text with supplementary character" do
+        expect(@extractor.extract_urls_with_indices("#{[0x10400].pack('U')} http://twitter.com")).to be == [{:url => "http://twitter.com", :indices => [2, 20]}]
+      end
+    end
+    describe "invalid URLS" do
+      it "does not link urls with invalid domains" do
+        expect(@extractor.extract_urls_with_indices("http://tld-too-short.x")).to be == []
+      end
+      it "does not consider a long URL with protocol to be valid" do
+        # maximum length of domain label is 32 chars.
+        url = ("a" * 31) + "."
+        url *= (Twitter::TwitterText::Extractor::MAX_URL_LENGTH / 32)
+        url = "https://" + url + "com" # longer than 4096 (MAX_URL_LENGTH) chars
+        expect(@extractor.is_valid_domain(url.length, url, true)).to be false
+      end
+      it "does not consider a long URL without protocol to be valid" do
+        # maximum length of domain label is 32 chars.
+        url = ("a" * 31) + "."
+        url *= ((Twitter::TwitterText::Extractor::MAX_URL_LENGTH / 32) - 1)
+        url = url + "com" # shorter than 4096 (MAX_URL_LENGTH) chars
+        expect(@extractor.is_valid_domain(url.length, url, false)).to be true
+        url = ("a" * (31 - "https://".length)) + "." + url
+        expect(@extractor.is_valid_domain(url.length, url, false)).to be false
+      end
+    end
+    describe "t.co URLS" do
+      TestUrls::TCO.each do |url|
+        it "should only extract the t.co URL from the URL #{url} and adjust indices correctly" do
+          extracted_urls = @extractor.extract_urls_with_indices(url)
+          expect(extracted_urls.size).to be == 1
+          extracted_url = extracted_urls.first
+          expect(extracted_url[:url]).to_not include(url)
+          expect(extracted_url[:url]).to include(url[0...20])
+          expect(extracted_url[:indices].first).to be == 0
+          expect(extracted_url[:indices].last).to be == 20
+        end
+        it "should match the t.co URL from the URL #{url} when it's embedded in other text" do
+          text = "Sweet url: #{url} I found. #awesome"
+          extracted_urls = @extractor.extract_urls_with_indices(text)
+          expect(extracted_urls.size).to be == 1
+          extracted_url = extracted_urls.first
+          expect(extracted_url[:url]).to_not include(url)
+          expect(extracted_url[:url]).to include(url[0...20])
+          expect(extracted_url[:indices].first).to be == 11
+          expect(extracted_url[:indices].last).to be == 31
+        end
+      end
+    end
+  end
+  describe "hashtags" do
+    context "extracts latin/numeric hashtags" do
+      %w(text text123 123text).each do |hashtag|
+        it "should extract ##{hashtag}" do
+          expect(@extractor.extract_hashtags("##{hashtag}")).to be == [hashtag]
+        end
+        it "should extract ##{hashtag} within text" do
+          expect(@extractor.extract_hashtags("pre-text ##{hashtag} post-text")).to be == [hashtag]
+        end
+      end
+    end
+    context "international hashtags" do
+      context "should allow accents" do
+        %w(mañana café münchen).each do |hashtag|
+          it "should extract ##{hashtag}" do
+            expect(@extractor.extract_hashtags("##{hashtag}")).to be == [hashtag]
+          end
+          it "should extract ##{hashtag} within text" do
+            expect(@extractor.extract_hashtags("pre-text ##{hashtag} post-text")).to be == [hashtag]
+          end
+        end
+        it "should not allow the multiplication character" do
+          expect(@extractor.extract_hashtags("#pre#{Twitter::TwitterText::Unicode::U00D7}post")).to be == ["pre"]
+        end
+        it "should not allow the division character" do
+          expect(@extractor.extract_hashtags("#pre#{Twitter::TwitterText::Unicode::U00F7}post")).to be == ["pre"]
+        end
+      end
+    end
+    it "should not extract numeric hashtags" do
+      expect(@extractor.extract_hashtags("#1234")).to be == []
+    end
+    it "should extract hashtag followed by punctuations" do
+      expect(@extractor.extract_hashtags("#test1: #test2; #test3\"")).to be == ["test1", "test2" ,"test3"]
+    end
+  end
+  describe "hashtags with indices" do
+    def match_hashtag_in_text(hashtag, text, offset = 0)
+      extracted_hashtags = @extractor.extract_hashtags_with_indices(text)
+      expect(extracted_hashtags.size).to be == 1
+      extracted_hashtag = extracted_hashtags.first
+      expect(extracted_hashtag[:hashtag]).to be == hashtag
+      expect(extracted_hashtag[:indices].first).to be == offset
+      expect(extracted_hashtag[:indices].last).to be == offset + hashtag.chars.to_a.size + 1
+    end
+    def not_match_hashtag_in_text(text)
+      extracted_hashtags = @extractor.extract_hashtags_with_indices(text)
+      expect(extracted_hashtags.size).to be == 0
+    end
+    context "extracts latin/numeric hashtags" do
+      %w(text text123 123text).each do |hashtag|
+        it "should extract ##{hashtag}" do
+          match_hashtag_in_text(hashtag, "##{hashtag}")
+        end
+        it "should extract ##{hashtag} within text" do
+          match_hashtag_in_text(hashtag, "pre-text ##{hashtag} post-text", 9)
+        end
+      end
+    end
+    context "international hashtags" do
+      context "should allow accents" do
+        %w(mañana café münchen).each do |hashtag|
+          it "should extract ##{hashtag}" do
+            match_hashtag_in_text(hashtag, "##{hashtag}")
+          end
+          it "should extract ##{hashtag} within text" do
+            match_hashtag_in_text(hashtag, "pre-text ##{hashtag} post-text", 9)
+          end
+        end
+        it "should not allow the multiplication character" do
+          match_hashtag_in_text("pre", "#pre#{[0xd7].pack('U')}post", 0)
+        end
+        it "should not allow the division character" do
+          match_hashtag_in_text("pre", "#pre#{[0xf7].pack('U')}post", 0)
+        end
+      end
+    end
+    it "should not extract numeric hashtags" do
+      not_match_hashtag_in_text("#1234")
+    end
+    it "should extract hashtag in text with supplementary character" do
+      match_hashtag_in_text("hashtag", "#{[0x10400].pack('U')} #hashtag", 2)
+    end
+  end
+end